{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "\n",
    "# Folder that receives the exported tokenizer files.\n",
    "out = 'bert-base-bahasa-standard-cased'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Remove whatever a previous run left behind so the export starts clean.\n",
    "!rm -rf \"{out}\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create the folder only after the cleanup above; creating it any earlier\n",
    "# would just have it deleted again by the rm.\n",
    "os.makedirs(out, exist_ok=True)\n",
    "directory = out"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "from transformers import BertTokenizer, BertModel, BertConfig, AutoTokenizer, AutoModelWithLMHead, pipeline, BertForMaskedLM"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "('bert-base-bahasa-standard-cased/vocab.txt',\n",
       " 'bert-base-bahasa-standard-cased/special_tokens_map.json',\n",
       " 'bert-base-bahasa-standard-cased/added_tokens.json')"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Build a cased tokenizer (do_lower_case=False) from the 'BERT.wordpiece' file\n",
    "# and write its files into the shared output folder held in `directory`,\n",
    "# rather than spelling the folder name out a second time.\n",
    "tokenizer = BertTokenizer('BERT.wordpiece', do_lower_case=False)\n",
    "tokenizer.save_pretrained(directory)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Reload the tokenizer from the files written by save_pretrained.\n",
    "# The path is derived from `directory` so the folder name lives in one place;\n",
    "# the './' prefix is kept as-is - presumably it forces a local lookup (TODO confirm).\n",
    "tokenizer = BertTokenizer.from_pretrained(f'./{directory}', do_lower_case=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Building PyTorch model from configuration: BertConfig {\n",
      "  \"attention_probs_dropout_prob\": 0.1,\n",
      "  \"directionality\": \"bidi\",\n",
      "  \"gradient_checkpointing\": false,\n",
      "  \"hidden_act\": \"gelu\",\n",
      "  \"hidden_dropout_prob\": 0.1,\n",
      "  \"hidden_size\": 768,\n",
      "  \"initializer_range\": 0.02,\n",
      "  \"intermediate_size\": 3072,\n",
      "  \"layer_norm_eps\": 1e-12,\n",
      "  \"max_position_embeddings\": 512,\n",
      "  \"model_type\": \"bert\",\n",
      "  \"num_attention_heads\": 12,\n",
      "  \"num_hidden_layers\": 12,\n",
      "  \"pad_token_id\": 0,\n",
      "  \"pooler_fc_size\": 768,\n",
      "  \"pooler_num_attention_heads\": 12,\n",
      "  \"pooler_num_fc_layers\": 3,\n",
      "  \"pooler_size_per_head\": 128,\n",
      "  \"pooler_type\": \"first_token_transform\",\n",
      "  \"type_vocab_size\": 2,\n",
      "  \"vocab_size\": 32000\n",
      "}\n",
      "\n",
      "Converting TensorFlow checkpoint from /home/husein/bert-standard/bert-base/model.ckpt-500000\n",
      "Loading TF weight bert/embeddings/LayerNorm/beta with shape [768]\n",
      "Loading TF weight bert/embeddings/LayerNorm/beta/adam_m with shape [768]\n",
      "Loading TF weight bert/embeddings/LayerNorm/beta/adam_v with shape [768]\n",
      "Loading TF weight bert/embeddings/LayerNorm/gamma with shape [768]\n",
      "Loading TF weight bert/embeddings/LayerNorm/gamma/adam_m with shape [768]\n",
      "Loading TF weight bert/embeddings/LayerNorm/gamma/adam_v with shape [768]\n",
      "Loading TF weight bert/embeddings/position_embeddings with shape [512, 768]\n",
      "Loading TF weight bert/embeddings/position_embeddings/adam_m with shape [512, 768]\n",
      "Loading TF weight bert/embeddings/position_embeddings/adam_v with shape [512, 768]\n",
      "Loading TF weight bert/embeddings/token_type_embeddings with shape [2, 768]\n",
      "Loading TF weight bert/embeddings/token_type_embeddings/adam_m with shape [2, 768]\n",
      "Loading TF weight bert/embeddings/token_type_embeddings/adam_v with shape [2, 768]\n",
      "Loading TF weight bert/embeddings/word_embeddings with shape [32000, 768]\n",
      "Loading TF weight bert/embeddings/word_embeddings/adam_m with shape [32000, 768]\n",
      "Loading TF weight bert/embeddings/word_embeddings/adam_v with shape [32000, 768]\n",
      "Loading TF weight bert/encoder/layer_0/attention/output/LayerNorm/beta with shape [768]\n",
      "Loading TF weight bert/encoder/layer_0/attention/output/LayerNorm/beta/adam_m with shape [768]\n",
      "Loading TF weight bert/encoder/layer_0/attention/output/LayerNorm/beta/adam_v with shape [768]\n",
      "Loading TF weight bert/encoder/layer_0/attention/output/LayerNorm/gamma with shape [768]\n",
      "Loading TF weight bert/encoder/layer_0/attention/output/LayerNorm/gamma/adam_m with shape [768]\n",
      "Loading TF weight bert/encoder/layer_0/attention/output/LayerNorm/gamma/adam_v with shape [768]\n",
      "Loading TF weight bert/encoder/layer_0/attention/output/dense/bias with shape [768]\n",
      "Loading TF weight bert/encoder/layer_0/attention/output/dense/bias/adam_m with shape [768]\n",
      "Loading TF weight bert/encoder/layer_0/attention/output/dense/bias/adam_v with shape [768]\n",
      "Loading TF weight bert/encoder/layer_0/attention/output/dense/kernel with shape [768, 768]\n",
      "Loading TF weight bert/encoder/layer_0/attention/output/dense/kernel/adam_m with shape [768, 768]\n",
      "Loading TF weight bert/encoder/layer_0/attention/output/dense/kernel/adam_v with shape [768, 768]\n",
      "Loading TF weight bert/encoder/layer_0/attention/self/key/bias with shape [768]\n",
      "Loading TF weight bert/encoder/layer_0/attention/self/key/bias/adam_m with shape [768]\n",
      "Loading TF weight bert/encoder/layer_0/attention/self/key/bias/adam_v with shape [768]\n",
      "Loading TF weight bert/encoder/layer_0/attention/self/key/kernel with shape [768, 768]\n",
      "Loading TF weight bert/encoder/layer_0/attention/self/key/kernel/adam_m with shape [768, 768]\n",
      "Loading TF weight bert/encoder/layer_0/attention/self/key/kernel/adam_v with shape [768, 768]\n",
      "Loading TF weight bert/encoder/layer_0/attention/self/query/bias with shape [768]\n",
      "Loading TF weight bert/encoder/layer_0/attention/self/query/bias/adam_m with shape [768]\n",
      "Loading TF weight bert/encoder/layer_0/attention/self/query/bias/adam_v with shape [768]\n",
      "Loading TF weight bert/encoder/layer_0/attention/self/query/kernel with shape [768, 768]\n",
      "Loading TF weight bert/encoder/layer_0/attention/self/query/kernel/adam_m with shape [768, 768]\n",
      "Loading TF weight bert/encoder/layer_0/attention/self/query/kernel/adam_v with shape [768, 768]\n",
      "Loading TF weight bert/encoder/layer_0/attention/self/value/bias with shape [768]\n",
      "Loading TF weight bert/encoder/layer_0/attention/self/value/bias/adam_m with shape [768]\n",
      "Loading TF weight bert/encoder/layer_0/attention/self/value/bias/adam_v with shape [768]\n",
      "Loading TF weight bert/encoder/layer_0/attention/self/value/kernel with shape [768, 768]\n",
      "Loading TF weight bert/encoder/layer_0/attention/self/value/kernel/adam_m with shape [768, 768]\n",
      "Loading TF weight bert/encoder/layer_0/attention/self/value/kernel/adam_v with shape [768, 768]\n",
      "Loading TF weight bert/encoder/layer_0/intermediate/dense/bias with shape [3072]\n",
      "Loading TF weight bert/encoder/layer_0/intermediate/dense/bias/adam_m with shape [3072]\n",
      "Loading TF weight bert/encoder/layer_0/intermediate/dense/bias/adam_v with shape [3072]\n",
      "Loading TF weight bert/encoder/layer_0/intermediate/dense/kernel with shape [768, 3072]\n",
      "Loading TF weight bert/encoder/layer_0/intermediate/dense/kernel/adam_m with shape [768, 3072]\n",
      "Loading TF weight bert/encoder/layer_0/intermediate/dense/kernel/adam_v with shape [768, 3072]\n",
      "Loading TF weight bert/encoder/layer_0/output/LayerNorm/beta with shape [768]\n",
      "Loading TF weight bert/encoder/layer_0/output/LayerNorm/beta/adam_m with shape [768]\n",
      "Loading TF weight bert/encoder/layer_0/output/LayerNorm/beta/adam_v with shape [768]\n",
      "Loading TF weight bert/encoder/layer_0/output/LayerNorm/gamma with shape [768]\n",
      "Loading TF weight bert/encoder/layer_0/output/LayerNorm/gamma/adam_m with shape [768]\n",
      "Loading TF weight bert/encoder/layer_0/output/LayerNorm/gamma/adam_v with shape [768]\n",
      "Loading TF weight bert/encoder/layer_0/output/dense/bias with shape [768]\n",
      "Loading TF weight bert/encoder/layer_0/output/dense/bias/adam_m with shape [768]\n",
      "Loading TF weight bert/encoder/layer_0/output/dense/bias/adam_v with shape [768]\n",
      "Loading TF weight bert/encoder/layer_0/output/dense/kernel with shape [3072, 768]\n",
      "Loading TF weight bert/encoder/layer_0/output/dense/kernel/adam_m with shape [3072, 768]\n",
      "Loading TF weight bert/encoder/layer_0/output/dense/kernel/adam_v with shape [3072, 768]\n",
      "Loading TF weight bert/encoder/layer_1/attention/output/LayerNorm/beta with shape [768]\n",
      "Loading TF weight bert/encoder/layer_1/attention/output/LayerNorm/beta/adam_m with shape [768]\n",
      "Loading TF weight bert/encoder/layer_1/attention/output/LayerNorm/beta/adam_v with shape [768]\n",
      "Loading TF weight bert/encoder/layer_1/attention/output/LayerNorm/gamma with shape [768]\n",
      "Loading TF weight bert/encoder/layer_1/attention/output/LayerNorm/gamma/adam_m with shape [768]\n",
      "Loading TF weight bert/encoder/layer_1/attention/output/LayerNorm/gamma/adam_v with shape [768]\n",
      "Loading TF weight bert/encoder/layer_1/attention/output/dense/bias with shape [768]\n",
      "Loading TF weight bert/encoder/layer_1/attention/output/dense/bias/adam_m with shape [768]\n",
      "Loading TF weight bert/encoder/layer_1/attention/output/dense/bias/adam_v with shape [768]\n",
      "Loading TF weight bert/encoder/layer_1/attention/output/dense/kernel with shape [768, 768]\n",
      "Loading TF weight bert/encoder/layer_1/attention/output/dense/kernel/adam_m with shape [768, 768]\n",
      "Loading TF weight bert/encoder/layer_1/attention/output/dense/kernel/adam_v with shape [768, 768]\n",
      "Loading TF weight bert/encoder/layer_1/attention/self/key/bias with shape [768]\n",
      "Loading TF weight bert/encoder/layer_1/attention/self/key/bias/adam_m with shape [768]\n",
      "Loading TF weight bert/encoder/layer_1/attention/self/key/bias/adam_v with shape [768]\n",
      "Loading TF weight bert/encoder/layer_1/attention/self/key/kernel with shape [768, 768]\n",
      "Loading TF weight bert/encoder/layer_1/attention/self/key/kernel/adam_m with shape [768, 768]\n",
      "Loading TF weight bert/encoder/layer_1/attention/self/key/kernel/adam_v with shape [768, 768]\n",
      "Loading TF weight bert/encoder/layer_1/attention/self/query/bias with shape [768]\n",
      "Loading TF weight bert/encoder/layer_1/attention/self/query/bias/adam_m with shape [768]\n",
      "Loading TF weight bert/encoder/layer_1/attention/self/query/bias/adam_v with shape [768]\n",
      "Loading TF weight bert/encoder/layer_1/attention/self/query/kernel with shape [768, 768]\n",
      "Loading TF weight bert/encoder/layer_1/attention/self/query/kernel/adam_m with shape [768, 768]\n",
      "Loading TF weight bert/encoder/layer_1/attention/self/query/kernel/adam_v with shape [768, 768]\n",
      "Loading TF weight bert/encoder/layer_1/attention/self/value/bias with shape [768]\n",
      "Loading TF weight bert/encoder/layer_1/attention/self/value/bias/adam_m with shape [768]\n",
      "Loading TF weight bert/encoder/layer_1/attention/self/value/bias/adam_v with shape [768]\n",
      "Loading TF weight bert/encoder/layer_1/attention/self/value/kernel with shape [768, 768]\n",
      "Loading TF weight bert/encoder/layer_1/attention/self/value/kernel/adam_m with shape [768, 768]\n",
      "Loading TF weight bert/encoder/layer_1/attention/self/value/kernel/adam_v with shape [768, 768]\n",
      "Loading TF weight bert/encoder/layer_1/intermediate/dense/bias with shape [3072]\n",
      "Loading TF weight bert/encoder/layer_1/intermediate/dense/bias/adam_m with shape [3072]\n",
      "Loading TF weight bert/encoder/layer_1/intermediate/dense/bias/adam_v with shape [3072]\n",
      "Loading TF weight bert/encoder/layer_1/intermediate/dense/kernel with shape [768, 3072]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Loading TF weight bert/encoder/layer_1/intermediate/dense/kernel/adam_m with shape [768, 3072]\n",
      "Loading TF weight bert/encoder/layer_1/intermediate/dense/kernel/adam_v with shape [768, 3072]\n",
      "Loading TF weight bert/encoder/layer_1/output/LayerNorm/beta with shape [768]\n",
      "Loading TF weight bert/encoder/layer_1/output/LayerNorm/beta/adam_m with shape [768]\n",
      "Loading TF weight bert/encoder/layer_1/output/LayerNorm/beta/adam_v with shape [768]\n",
      "Loading TF weight bert/encoder/layer_1/output/LayerNorm/gamma with shape [768]\n",
      "Loading TF weight bert/encoder/layer_1/output/LayerNorm/gamma/adam_m with shape [768]\n",
      "Loading TF weight bert/encoder/layer_1/output/LayerNorm/gamma/adam_v with shape [768]\n",
      "Loading TF weight bert/encoder/layer_1/output/dense/bias with shape [768]\n",
      "Loading TF weight bert/encoder/layer_1/output/dense/bias/adam_m with shape [768]\n",
      "Loading TF weight bert/encoder/layer_1/output/dense/bias/adam_v with shape [768]\n",
      "Loading TF weight bert/encoder/layer_1/output/dense/kernel with shape [3072, 768]\n",
      "Loading TF weight bert/encoder/layer_1/output/dense/kernel/adam_m with shape [3072, 768]\n",
      "Loading TF weight bert/encoder/layer_1/output/dense/kernel/adam_v with shape [3072, 768]\n",
      "Loading TF weight bert/encoder/layer_10/attention/output/LayerNorm/beta with shape [768]\n",
      "Loading TF weight bert/encoder/layer_10/attention/output/LayerNorm/beta/adam_m with shape [768]\n",
      "Loading TF weight bert/encoder/layer_10/attention/output/LayerNorm/beta/adam_v with shape [768]\n",
      "Loading TF weight bert/encoder/layer_10/attention/output/LayerNorm/gamma with shape [768]\n",
      "Loading TF weight bert/encoder/layer_10/attention/output/LayerNorm/gamma/adam_m with shape [768]\n",
      "Loading TF weight bert/encoder/layer_10/attention/output/LayerNorm/gamma/adam_v with shape [768]\n",
      "Loading TF weight bert/encoder/layer_10/attention/output/dense/bias with shape [768]\n",
      "Loading TF weight bert/encoder/layer_10/attention/output/dense/bias/adam_m with shape [768]\n",
      "Loading TF weight bert/encoder/layer_10/attention/output/dense/bias/adam_v with shape [768]\n",
      "Loading TF weight bert/encoder/layer_10/attention/output/dense/kernel with shape [768, 768]\n",
      "Loading TF weight bert/encoder/layer_10/attention/output/dense/kernel/adam_m with shape [768, 768]\n",
      "Loading TF weight bert/encoder/layer_10/attention/output/dense/kernel/adam_v with shape [768, 768]\n",
      "Loading TF weight bert/encoder/layer_10/attention/self/key/bias with shape [768]\n",
      "Loading TF weight bert/encoder/layer_10/attention/self/key/bias/adam_m with shape [768]\n",
      "Loading TF weight bert/encoder/layer_10/attention/self/key/bias/adam_v with shape [768]\n",
      "Loading TF weight bert/encoder/layer_10/attention/self/key/kernel with shape [768, 768]\n",
      "Loading TF weight bert/encoder/layer_10/attention/self/key/kernel/adam_m with shape [768, 768]\n",
      "Loading TF weight bert/encoder/layer_10/attention/self/key/kernel/adam_v with shape [768, 768]\n",
      "Loading TF weight bert/encoder/layer_10/attention/self/query/bias with shape [768]\n",
      "Loading TF weight bert/encoder/layer_10/attention/self/query/bias/adam_m with shape [768]\n",
      "Loading TF weight bert/encoder/layer_10/attention/self/query/bias/adam_v with shape [768]\n",
      "Loading TF weight bert/encoder/layer_10/attention/self/query/kernel with shape [768, 768]\n",
      "Loading TF weight bert/encoder/layer_10/attention/self/query/kernel/adam_m with shape [768, 768]\n",
      "Loading TF weight bert/encoder/layer_10/attention/self/query/kernel/adam_v with shape [768, 768]\n",
      "Loading TF weight bert/encoder/layer_10/attention/self/value/bias with shape [768]\n",
      "Loading TF weight bert/encoder/layer_10/attention/self/value/bias/adam_m with shape [768]\n",
      "Loading TF weight bert/encoder/layer_10/attention/self/value/bias/adam_v with shape [768]\n",
      "Loading TF weight bert/encoder/layer_10/attention/self/value/kernel with shape [768, 768]\n",
      "Loading TF weight bert/encoder/layer_10/attention/self/value/kernel/adam_m with shape [768, 768]\n",
      "Loading TF weight bert/encoder/layer_10/attention/self/value/kernel/adam_v with shape [768, 768]\n",
      "Loading TF weight bert/encoder/layer_10/intermediate/dense/bias with shape [3072]\n",
      "Loading TF weight bert/encoder/layer_10/intermediate/dense/bias/adam_m with shape [3072]\n",
      "Loading TF weight bert/encoder/layer_10/intermediate/dense/bias/adam_v with shape [3072]\n",
      "Loading TF weight bert/encoder/layer_10/intermediate/dense/kernel with shape [768, 3072]\n",
      "Loading TF weight bert/encoder/layer_10/intermediate/dense/kernel/adam_m with shape [768, 3072]\n",
      "Loading TF weight bert/encoder/layer_10/intermediate/dense/kernel/adam_v with shape [768, 3072]\n",
      "Loading TF weight bert/encoder/layer_10/output/LayerNorm/beta with shape [768]\n",
      "Loading TF weight bert/encoder/layer_10/output/LayerNorm/beta/adam_m with shape [768]\n",
      "Loading TF weight bert/encoder/layer_10/output/LayerNorm/beta/adam_v with shape [768]\n",
      "Loading TF weight bert/encoder/layer_10/output/LayerNorm/gamma with shape [768]\n",
      "Loading TF weight bert/encoder/layer_10/output/LayerNorm/gamma/adam_m with shape [768]\n",
      "Loading TF weight bert/encoder/layer_10/output/LayerNorm/gamma/adam_v with shape [768]\n",
      "Loading TF weight bert/encoder/layer_10/output/dense/bias with shape [768]\n",
      "Loading TF weight bert/encoder/layer_10/output/dense/bias/adam_m with shape [768]\n",
      "Loading TF weight bert/encoder/layer_10/output/dense/bias/adam_v with shape [768]\n",
      "Loading TF weight bert/encoder/layer_10/output/dense/kernel with shape [3072, 768]\n",
      "Loading TF weight bert/encoder/layer_10/output/dense/kernel/adam_m with shape [3072, 768]\n",
      "Loading TF weight bert/encoder/layer_10/output/dense/kernel/adam_v with shape [3072, 768]\n",
      "Loading TF weight bert/encoder/layer_11/attention/output/LayerNorm/beta with shape [768]\n",
      "Loading TF weight bert/encoder/layer_11/attention/output/LayerNorm/beta/adam_m with shape [768]\n",
      "Loading TF weight bert/encoder/layer_11/attention/output/LayerNorm/beta/adam_v with shape [768]\n",
      "Loading TF weight bert/encoder/layer_11/attention/output/LayerNorm/gamma with shape [768]\n",
      "Loading TF weight bert/encoder/layer_11/attention/output/LayerNorm/gamma/adam_m with shape [768]\n",
      "Loading TF weight bert/encoder/layer_11/attention/output/LayerNorm/gamma/adam_v with shape [768]\n",
      "Loading TF weight bert/encoder/layer_11/attention/output/dense/bias with shape [768]\n",
      "Loading TF weight bert/encoder/layer_11/attention/output/dense/bias/adam_m with shape [768]\n",
      "Loading TF weight bert/encoder/layer_11/attention/output/dense/bias/adam_v with shape [768]\n",
      "Loading TF weight bert/encoder/layer_11/attention/output/dense/kernel with shape [768, 768]\n",
      "Loading TF weight bert/encoder/layer_11/attention/output/dense/kernel/adam_m with shape [768, 768]\n",
      "Loading TF weight bert/encoder/layer_11/attention/output/dense/kernel/adam_v with shape [768, 768]\n",
      "Loading TF weight bert/encoder/layer_11/attention/self/key/bias with shape [768]\n",
      "Loading TF weight bert/encoder/layer_11/attention/self/key/bias/adam_m with shape [768]\n",
      "Loading TF weight bert/encoder/layer_11/attention/self/key/bias/adam_v with shape [768]\n",
      "Loading TF weight bert/encoder/layer_11/attention/self/key/kernel with shape [768, 768]\n",
      "Loading TF weight bert/encoder/layer_11/attention/self/key/kernel/adam_m with shape [768, 768]\n",
      "Loading TF weight bert/encoder/layer_11/attention/self/key/kernel/adam_v with shape [768, 768]\n",
      "Loading TF weight bert/encoder/layer_11/attention/self/query/bias with shape [768]\n",
      "Loading TF weight bert/encoder/layer_11/attention/self/query/bias/adam_m with shape [768]\n",
      "Loading TF weight bert/encoder/layer_11/attention/self/query/bias/adam_v with shape [768]\n",
      "Loading TF weight bert/encoder/layer_11/attention/self/query/kernel with shape [768, 768]\n",
      "Loading TF weight bert/encoder/layer_11/attention/self/query/kernel/adam_m with shape [768, 768]\n",
      "Loading TF weight bert/encoder/layer_11/attention/self/query/kernel/adam_v with shape [768, 768]\n",
      "Loading TF weight bert/encoder/layer_11/attention/self/value/bias with shape [768]\n",
      "Loading TF weight bert/encoder/layer_11/attention/self/value/bias/adam_m with shape [768]\n",
      "Loading TF weight bert/encoder/layer_11/attention/self/value/bias/adam_v with shape [768]\n",
      "Loading TF weight bert/encoder/layer_11/attention/self/value/kernel with shape [768, 768]\n",
      "Loading TF weight bert/encoder/layer_11/attention/self/value/kernel/adam_m with shape [768, 768]\n",
      "Loading TF weight bert/encoder/layer_11/attention/self/value/kernel/adam_v with shape [768, 768]\n",
      "Loading TF weight bert/encoder/layer_11/intermediate/dense/bias with shape [3072]\n",
      "Loading TF weight bert/encoder/layer_11/intermediate/dense/bias/adam_m with shape [3072]\n",
      "Loading TF weight bert/encoder/layer_11/intermediate/dense/bias/adam_v with shape [3072]\n",
      "Loading TF weight bert/encoder/layer_11/intermediate/dense/kernel with shape [768, 3072]\n",
      "Loading TF weight bert/encoder/layer_11/intermediate/dense/kernel/adam_m with shape [768, 3072]\n",
      "Loading TF weight bert/encoder/layer_11/intermediate/dense/kernel/adam_v with shape [768, 3072]\n",
      "Loading TF weight bert/encoder/layer_11/output/LayerNorm/beta with shape [768]\n",
      "Loading TF weight bert/encoder/layer_11/output/LayerNorm/beta/adam_m with shape [768]\n",
      "Loading TF weight bert/encoder/layer_11/output/LayerNorm/beta/adam_v with shape [768]\n",
      "Loading TF weight bert/encoder/layer_11/output/LayerNorm/gamma with shape [768]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Loading TF weight bert/encoder/layer_11/output/LayerNorm/gamma/adam_m with shape [768]\n",
      "Loading TF weight bert/encoder/layer_11/output/LayerNorm/gamma/adam_v with shape [768]\n",
      "Loading TF weight bert/encoder/layer_11/output/dense/bias with shape [768]\n",
      "Loading TF weight bert/encoder/layer_11/output/dense/bias/adam_m with shape [768]\n",
      "Loading TF weight bert/encoder/layer_11/output/dense/bias/adam_v with shape [768]\n",
      "Loading TF weight bert/encoder/layer_11/output/dense/kernel with shape [3072, 768]\n",
      "Loading TF weight bert/encoder/layer_11/output/dense/kernel/adam_m with shape [3072, 768]\n",
      "Loading TF weight bert/encoder/layer_11/output/dense/kernel/adam_v with shape [3072, 768]\n",
      "Loading TF weight bert/encoder/layer_2/attention/output/LayerNorm/beta with shape [768]\n",
      "Loading TF weight bert/encoder/layer_2/attention/output/LayerNorm/beta/adam_m with shape [768]\n",
      "Loading TF weight bert/encoder/layer_2/attention/output/LayerNorm/beta/adam_v with shape [768]\n",
      "Loading TF weight bert/encoder/layer_2/attention/output/LayerNorm/gamma with shape [768]\n",
      "Loading TF weight bert/encoder/layer_2/attention/output/LayerNorm/gamma/adam_m with shape [768]\n",
      "Loading TF weight bert/encoder/layer_2/attention/output/LayerNorm/gamma/adam_v with shape [768]\n",
      "Loading TF weight bert/encoder/layer_2/attention/output/dense/bias with shape [768]\n",
      "Loading TF weight bert/encoder/layer_2/attention/output/dense/bias/adam_m with shape [768]\n",
      "Loading TF weight bert/encoder/layer_2/attention/output/dense/bias/adam_v with shape [768]\n",
      "Loading TF weight bert/encoder/layer_2/attention/output/dense/kernel with shape [768, 768]\n",
      "Loading TF weight bert/encoder/layer_2/attention/output/dense/kernel/adam_m with shape [768, 768]\n",
      "Loading TF weight bert/encoder/layer_2/attention/output/dense/kernel/adam_v with shape [768, 768]\n",
      "Loading TF weight bert/encoder/layer_2/attention/self/key/bias with shape [768]\n",
      "Loading TF weight bert/encoder/layer_2/attention/self/key/bias/adam_m with shape [768]\n",
      "Loading TF weight bert/encoder/layer_2/attention/self/key/bias/adam_v with shape [768]\n",
      "Loading TF weight bert/encoder/layer_2/attention/self/key/kernel with shape [768, 768]\n",
      "Loading TF weight bert/encoder/layer_2/attention/self/key/kernel/adam_m with shape [768, 768]\n",
      "Loading TF weight bert/encoder/layer_2/attention/self/key/kernel/adam_v with shape [768, 768]\n",
      "Loading TF weight bert/encoder/layer_2/attention/self/query/bias with shape [768]\n",
      "Loading TF weight bert/encoder/layer_2/attention/self/query/bias/adam_m with shape [768]\n",
      "Loading TF weight bert/encoder/layer_2/attention/self/query/bias/adam_v with shape [768]\n",
      "Loading TF weight bert/encoder/layer_2/attention/self/query/kernel with shape [768, 768]\n",
      "Loading TF weight bert/encoder/layer_2/attention/self/query/kernel/adam_m with shape [768, 768]\n",
      "Loading TF weight bert/encoder/layer_2/attention/self/query/kernel/adam_v with shape [768, 768]\n",
      "Loading TF weight bert/encoder/layer_2/attention/self/value/bias with shape [768]\n",
      "Loading TF weight bert/encoder/layer_2/attention/self/value/bias/adam_m with shape [768]\n",
      "Loading TF weight bert/encoder/layer_2/attention/self/value/bias/adam_v with shape [768]\n",
      "Loading TF weight bert/encoder/layer_2/attention/self/value/kernel with shape [768, 768]\n",
      "Loading TF weight bert/encoder/layer_2/attention/self/value/kernel/adam_m with shape [768, 768]\n",
      "Loading TF weight bert/encoder/layer_2/attention/self/value/kernel/adam_v with shape [768, 768]\n",
      "Loading TF weight bert/encoder/layer_2/intermediate/dense/bias with shape [3072]\n",
      "Loading TF weight bert/encoder/layer_2/intermediate/dense/bias/adam_m with shape [3072]\n",
      "Loading TF weight bert/encoder/layer_2/intermediate/dense/bias/adam_v with shape [3072]\n",
      "Loading TF weight bert/encoder/layer_2/intermediate/dense/kernel with shape [768, 3072]\n",
      "Loading TF weight bert/encoder/layer_2/intermediate/dense/kernel/adam_m with shape [768, 3072]\n",
      "Loading TF weight bert/encoder/layer_2/intermediate/dense/kernel/adam_v with shape [768, 3072]\n",
      "Loading TF weight bert/encoder/layer_2/output/LayerNorm/beta with shape [768]\n",
      "Loading TF weight bert/encoder/layer_2/output/LayerNorm/beta/adam_m with shape [768]\n",
      "Loading TF weight bert/encoder/layer_2/output/LayerNorm/beta/adam_v with shape [768]\n",
      "Loading TF weight bert/encoder/layer_2/output/LayerNorm/gamma with shape [768]\n",
      "Loading TF weight bert/encoder/layer_2/output/LayerNorm/gamma/adam_m with shape [768]\n",
      "Loading TF weight bert/encoder/layer_2/output/LayerNorm/gamma/adam_v with shape [768]\n",
      "Loading TF weight bert/encoder/layer_2/output/dense/bias with shape [768]\n",
      "Loading TF weight bert/encoder/layer_2/output/dense/bias/adam_m with shape [768]\n",
      "Loading TF weight bert/encoder/layer_2/output/dense/bias/adam_v with shape [768]\n",
      "Loading TF weight bert/encoder/layer_2/output/dense/kernel with shape [3072, 768]\n",
      "Loading TF weight bert/encoder/layer_2/output/dense/kernel/adam_m with shape [3072, 768]\n",
      "Loading TF weight bert/encoder/layer_2/output/dense/kernel/adam_v with shape [3072, 768]\n",
      "Loading TF weight bert/encoder/layer_3/attention/output/LayerNorm/beta with shape [768]\n",
      "Loading TF weight bert/encoder/layer_3/attention/output/LayerNorm/beta/adam_m with shape [768]\n",
      "Loading TF weight bert/encoder/layer_3/attention/output/LayerNorm/beta/adam_v with shape [768]\n",
      "Loading TF weight bert/encoder/layer_3/attention/output/LayerNorm/gamma with shape [768]\n",
      "Loading TF weight bert/encoder/layer_3/attention/output/LayerNorm/gamma/adam_m with shape [768]\n",
      "Loading TF weight bert/encoder/layer_3/attention/output/LayerNorm/gamma/adam_v with shape [768]\n",
      "Loading TF weight bert/encoder/layer_3/attention/output/dense/bias with shape [768]\n",
      "Loading TF weight bert/encoder/layer_3/attention/output/dense/bias/adam_m with shape [768]\n",
      "Loading TF weight bert/encoder/layer_3/attention/output/dense/bias/adam_v with shape [768]\n",
      "Loading TF weight bert/encoder/layer_3/attention/output/dense/kernel with shape [768, 768]\n",
      "Loading TF weight bert/encoder/layer_3/attention/output/dense/kernel/adam_m with shape [768, 768]\n",
      "Loading TF weight bert/encoder/layer_3/attention/output/dense/kernel/adam_v with shape [768, 768]\n",
      "Loading TF weight bert/encoder/layer_3/attention/self/key/bias with shape [768]\n",
      "Loading TF weight bert/encoder/layer_3/attention/self/key/bias/adam_m with shape [768]\n",
      "Loading TF weight bert/encoder/layer_3/attention/self/key/bias/adam_v with shape [768]\n",
      "Loading TF weight bert/encoder/layer_3/attention/self/key/kernel with shape [768, 768]\n",
      "Loading TF weight bert/encoder/layer_3/attention/self/key/kernel/adam_m with shape [768, 768]\n",
      "Loading TF weight bert/encoder/layer_3/attention/self/key/kernel/adam_v with shape [768, 768]\n",
      "Loading TF weight bert/encoder/layer_3/attention/self/query/bias with shape [768]\n",
      "Loading TF weight bert/encoder/layer_3/attention/self/query/bias/adam_m with shape [768]\n",
      "Loading TF weight bert/encoder/layer_3/attention/self/query/bias/adam_v with shape [768]\n",
      "Loading TF weight bert/encoder/layer_3/attention/self/query/kernel with shape [768, 768]\n",
      "Loading TF weight bert/encoder/layer_3/attention/self/query/kernel/adam_m with shape [768, 768]\n",
      "Loading TF weight bert/encoder/layer_3/attention/self/query/kernel/adam_v with shape [768, 768]\n",
      "Loading TF weight bert/encoder/layer_3/attention/self/value/bias with shape [768]\n",
      "Loading TF weight bert/encoder/layer_3/attention/self/value/bias/adam_m with shape [768]\n",
      "Loading TF weight bert/encoder/layer_3/attention/self/value/bias/adam_v with shape [768]\n",
      "Loading TF weight bert/encoder/layer_3/attention/self/value/kernel with shape [768, 768]\n",
      "Loading TF weight bert/encoder/layer_3/attention/self/value/kernel/adam_m with shape [768, 768]\n",
      "Loading TF weight bert/encoder/layer_3/attention/self/value/kernel/adam_v with shape [768, 768]\n",
      "Loading TF weight bert/encoder/layer_3/intermediate/dense/bias with shape [3072]\n",
      "Loading TF weight bert/encoder/layer_3/intermediate/dense/bias/adam_m with shape [3072]\n",
      "Loading TF weight bert/encoder/layer_3/intermediate/dense/bias/adam_v with shape [3072]\n",
      "Loading TF weight bert/encoder/layer_3/intermediate/dense/kernel with shape [768, 3072]\n",
      "Loading TF weight bert/encoder/layer_3/intermediate/dense/kernel/adam_m with shape [768, 3072]\n",
      "Loading TF weight bert/encoder/layer_3/intermediate/dense/kernel/adam_v with shape [768, 3072]\n",
      "Loading TF weight bert/encoder/layer_3/output/LayerNorm/beta with shape [768]\n",
      "Loading TF weight bert/encoder/layer_3/output/LayerNorm/beta/adam_m with shape [768]\n",
      "Loading TF weight bert/encoder/layer_3/output/LayerNorm/beta/adam_v with shape [768]\n",
      "Loading TF weight bert/encoder/layer_3/output/LayerNorm/gamma with shape [768]\n",
      "Loading TF weight bert/encoder/layer_3/output/LayerNorm/gamma/adam_m with shape [768]\n",
      "Loading TF weight bert/encoder/layer_3/output/LayerNorm/gamma/adam_v with shape [768]\n",
      "Loading TF weight bert/encoder/layer_3/output/dense/bias with shape [768]\n",
      "Loading TF weight bert/encoder/layer_3/output/dense/bias/adam_m with shape [768]\n",
      "Loading TF weight bert/encoder/layer_3/output/dense/bias/adam_v with shape [768]\n",
      "Loading TF weight bert/encoder/layer_3/output/dense/kernel with shape [3072, 768]\n",
      "Loading TF weight bert/encoder/layer_3/output/dense/kernel/adam_m with shape [3072, 768]\n",
      "Loading TF weight bert/encoder/layer_3/output/dense/kernel/adam_v with shape [3072, 768]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Loading TF weight bert/encoder/layer_4/attention/output/LayerNorm/beta with shape [768]\n",
      "Loading TF weight bert/encoder/layer_4/attention/output/LayerNorm/beta/adam_m with shape [768]\n",
      "Loading TF weight bert/encoder/layer_4/attention/output/LayerNorm/beta/adam_v with shape [768]\n",
      "Loading TF weight bert/encoder/layer_4/attention/output/LayerNorm/gamma with shape [768]\n",
      "Loading TF weight bert/encoder/layer_4/attention/output/LayerNorm/gamma/adam_m with shape [768]\n",
      "Loading TF weight bert/encoder/layer_4/attention/output/LayerNorm/gamma/adam_v with shape [768]\n",
      "Loading TF weight bert/encoder/layer_4/attention/output/dense/bias with shape [768]\n",
      "Loading TF weight bert/encoder/layer_4/attention/output/dense/bias/adam_m with shape [768]\n",
      "Loading TF weight bert/encoder/layer_4/attention/output/dense/bias/adam_v with shape [768]\n",
      "Loading TF weight bert/encoder/layer_4/attention/output/dense/kernel with shape [768, 768]\n",
      "Loading TF weight bert/encoder/layer_4/attention/output/dense/kernel/adam_m with shape [768, 768]\n",
      "Loading TF weight bert/encoder/layer_4/attention/output/dense/kernel/adam_v with shape [768, 768]\n",
      "Loading TF weight bert/encoder/layer_4/attention/self/key/bias with shape [768]\n",
      "Loading TF weight bert/encoder/layer_4/attention/self/key/bias/adam_m with shape [768]\n",
      "Loading TF weight bert/encoder/layer_4/attention/self/key/bias/adam_v with shape [768]\n",
      "Loading TF weight bert/encoder/layer_4/attention/self/key/kernel with shape [768, 768]\n",
      "Loading TF weight bert/encoder/layer_4/attention/self/key/kernel/adam_m with shape [768, 768]\n",
      "Loading TF weight bert/encoder/layer_4/attention/self/key/kernel/adam_v with shape [768, 768]\n",
      "Loading TF weight bert/encoder/layer_4/attention/self/query/bias with shape [768]\n",
      "Loading TF weight bert/encoder/layer_4/attention/self/query/bias/adam_m with shape [768]\n",
      "Loading TF weight bert/encoder/layer_4/attention/self/query/bias/adam_v with shape [768]\n",
      "Loading TF weight bert/encoder/layer_4/attention/self/query/kernel with shape [768, 768]\n",
      "Loading TF weight bert/encoder/layer_4/attention/self/query/kernel/adam_m with shape [768, 768]\n",
      "Loading TF weight bert/encoder/layer_4/attention/self/query/kernel/adam_v with shape [768, 768]\n",
      "Loading TF weight bert/encoder/layer_4/attention/self/value/bias with shape [768]\n",
      "Loading TF weight bert/encoder/layer_4/attention/self/value/bias/adam_m with shape [768]\n",
      "Loading TF weight bert/encoder/layer_4/attention/self/value/bias/adam_v with shape [768]\n",
      "Loading TF weight bert/encoder/layer_4/attention/self/value/kernel with shape [768, 768]\n",
      "Loading TF weight bert/encoder/layer_4/attention/self/value/kernel/adam_m with shape [768, 768]\n",
      "Loading TF weight bert/encoder/layer_4/attention/self/value/kernel/adam_v with shape [768, 768]\n",
      "Loading TF weight bert/encoder/layer_4/intermediate/dense/bias with shape [3072]\n",
      "Loading TF weight bert/encoder/layer_4/intermediate/dense/bias/adam_m with shape [3072]\n",
      "Loading TF weight bert/encoder/layer_4/intermediate/dense/bias/adam_v with shape [3072]\n",
      "Loading TF weight bert/encoder/layer_4/intermediate/dense/kernel with shape [768, 3072]\n",
      "Loading TF weight bert/encoder/layer_4/intermediate/dense/kernel/adam_m with shape [768, 3072]\n",
      "Loading TF weight bert/encoder/layer_4/intermediate/dense/kernel/adam_v with shape [768, 3072]\n",
      "Loading TF weight bert/encoder/layer_4/output/LayerNorm/beta with shape [768]\n",
      "Loading TF weight bert/encoder/layer_4/output/LayerNorm/beta/adam_m with shape [768]\n",
      "Loading TF weight bert/encoder/layer_4/output/LayerNorm/beta/adam_v with shape [768]\n",
      "Loading TF weight bert/encoder/layer_4/output/LayerNorm/gamma with shape [768]\n",
      "Loading TF weight bert/encoder/layer_4/output/LayerNorm/gamma/adam_m with shape [768]\n",
      "Loading TF weight bert/encoder/layer_4/output/LayerNorm/gamma/adam_v with shape [768]\n",
      "Loading TF weight bert/encoder/layer_4/output/dense/bias with shape [768]\n",
      "Loading TF weight bert/encoder/layer_4/output/dense/bias/adam_m with shape [768]\n",
      "Loading TF weight bert/encoder/layer_4/output/dense/bias/adam_v with shape [768]\n",
      "Loading TF weight bert/encoder/layer_4/output/dense/kernel with shape [3072, 768]\n",
      "Loading TF weight bert/encoder/layer_4/output/dense/kernel/adam_m with shape [3072, 768]\n",
      "Loading TF weight bert/encoder/layer_4/output/dense/kernel/adam_v with shape [3072, 768]\n",
      "Loading TF weight bert/encoder/layer_5/attention/output/LayerNorm/beta with shape [768]\n",
      "Loading TF weight bert/encoder/layer_5/attention/output/LayerNorm/beta/adam_m with shape [768]\n",
      "Loading TF weight bert/encoder/layer_5/attention/output/LayerNorm/beta/adam_v with shape [768]\n",
      "Loading TF weight bert/encoder/layer_5/attention/output/LayerNorm/gamma with shape [768]\n",
      "Loading TF weight bert/encoder/layer_5/attention/output/LayerNorm/gamma/adam_m with shape [768]\n",
      "Loading TF weight bert/encoder/layer_5/attention/output/LayerNorm/gamma/adam_v with shape [768]\n",
      "Loading TF weight bert/encoder/layer_5/attention/output/dense/bias with shape [768]\n",
      "Loading TF weight bert/encoder/layer_5/attention/output/dense/bias/adam_m with shape [768]\n",
      "Loading TF weight bert/encoder/layer_5/attention/output/dense/bias/adam_v with shape [768]\n",
      "Loading TF weight bert/encoder/layer_5/attention/output/dense/kernel with shape [768, 768]\n",
      "Loading TF weight bert/encoder/layer_5/attention/output/dense/kernel/adam_m with shape [768, 768]\n",
      "Loading TF weight bert/encoder/layer_5/attention/output/dense/kernel/adam_v with shape [768, 768]\n",
      "Loading TF weight bert/encoder/layer_5/attention/self/key/bias with shape [768]\n",
      "Loading TF weight bert/encoder/layer_5/attention/self/key/bias/adam_m with shape [768]\n",
      "Loading TF weight bert/encoder/layer_5/attention/self/key/bias/adam_v with shape [768]\n",
      "Loading TF weight bert/encoder/layer_5/attention/self/key/kernel with shape [768, 768]\n",
      "Loading TF weight bert/encoder/layer_5/attention/self/key/kernel/adam_m with shape [768, 768]\n",
      "Loading TF weight bert/encoder/layer_5/attention/self/key/kernel/adam_v with shape [768, 768]\n",
      "Loading TF weight bert/encoder/layer_5/attention/self/query/bias with shape [768]\n",
      "Loading TF weight bert/encoder/layer_5/attention/self/query/bias/adam_m with shape [768]\n",
      "Loading TF weight bert/encoder/layer_5/attention/self/query/bias/adam_v with shape [768]\n",
      "Loading TF weight bert/encoder/layer_5/attention/self/query/kernel with shape [768, 768]\n",
      "Loading TF weight bert/encoder/layer_5/attention/self/query/kernel/adam_m with shape [768, 768]\n",
      "Loading TF weight bert/encoder/layer_5/attention/self/query/kernel/adam_v with shape [768, 768]\n",
      "Loading TF weight bert/encoder/layer_5/attention/self/value/bias with shape [768]\n",
      "Loading TF weight bert/encoder/layer_5/attention/self/value/bias/adam_m with shape [768]\n",
      "Loading TF weight bert/encoder/layer_5/attention/self/value/bias/adam_v with shape [768]\n",
      "Loading TF weight bert/encoder/layer_5/attention/self/value/kernel with shape [768, 768]\n",
      "Loading TF weight bert/encoder/layer_5/attention/self/value/kernel/adam_m with shape [768, 768]\n",
      "Loading TF weight bert/encoder/layer_5/attention/self/value/kernel/adam_v with shape [768, 768]\n",
      "Loading TF weight bert/encoder/layer_5/intermediate/dense/bias with shape [3072]\n",
      "Loading TF weight bert/encoder/layer_5/intermediate/dense/bias/adam_m with shape [3072]\n",
      "Loading TF weight bert/encoder/layer_5/intermediate/dense/bias/adam_v with shape [3072]\n",
      "Loading TF weight bert/encoder/layer_5/intermediate/dense/kernel with shape [768, 3072]\n",
      "Loading TF weight bert/encoder/layer_5/intermediate/dense/kernel/adam_m with shape [768, 3072]\n",
      "Loading TF weight bert/encoder/layer_5/intermediate/dense/kernel/adam_v with shape [768, 3072]\n",
      "Loading TF weight bert/encoder/layer_5/output/LayerNorm/beta with shape [768]\n",
      "Loading TF weight bert/encoder/layer_5/output/LayerNorm/beta/adam_m with shape [768]\n",
      "Loading TF weight bert/encoder/layer_5/output/LayerNorm/beta/adam_v with shape [768]\n",
      "Loading TF weight bert/encoder/layer_5/output/LayerNorm/gamma with shape [768]\n",
      "Loading TF weight bert/encoder/layer_5/output/LayerNorm/gamma/adam_m with shape [768]\n",
      "Loading TF weight bert/encoder/layer_5/output/LayerNorm/gamma/adam_v with shape [768]\n",
      "Loading TF weight bert/encoder/layer_5/output/dense/bias with shape [768]\n",
      "Loading TF weight bert/encoder/layer_5/output/dense/bias/adam_m with shape [768]\n",
      "Loading TF weight bert/encoder/layer_5/output/dense/bias/adam_v with shape [768]\n",
      "Loading TF weight bert/encoder/layer_5/output/dense/kernel with shape [3072, 768]\n",
      "Loading TF weight bert/encoder/layer_5/output/dense/kernel/adam_m with shape [3072, 768]\n",
      "Loading TF weight bert/encoder/layer_5/output/dense/kernel/adam_v with shape [3072, 768]\n",
      "Loading TF weight bert/encoder/layer_6/attention/output/LayerNorm/beta with shape [768]\n",
      "Loading TF weight bert/encoder/layer_6/attention/output/LayerNorm/beta/adam_m with shape [768]\n",
      "Loading TF weight bert/encoder/layer_6/attention/output/LayerNorm/beta/adam_v with shape [768]\n",
      "Loading TF weight bert/encoder/layer_6/attention/output/LayerNorm/gamma with shape [768]\n",
      "Loading TF weight bert/encoder/layer_6/attention/output/LayerNorm/gamma/adam_m with shape [768]\n",
      "Loading TF weight bert/encoder/layer_6/attention/output/LayerNorm/gamma/adam_v with shape [768]\n",
      "Loading TF weight bert/encoder/layer_6/attention/output/dense/bias with shape [768]\n",
      "Loading TF weight bert/encoder/layer_6/attention/output/dense/bias/adam_m with shape [768]\n",
      "Loading TF weight bert/encoder/layer_6/attention/output/dense/bias/adam_v with shape [768]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Loading TF weight bert/encoder/layer_6/attention/output/dense/kernel with shape [768, 768]\n",
      "Loading TF weight bert/encoder/layer_6/attention/output/dense/kernel/adam_m with shape [768, 768]\n",
      "Loading TF weight bert/encoder/layer_6/attention/output/dense/kernel/adam_v with shape [768, 768]\n",
      "Loading TF weight bert/encoder/layer_6/attention/self/key/bias with shape [768]\n",
      "Loading TF weight bert/encoder/layer_6/attention/self/key/bias/adam_m with shape [768]\n",
      "Loading TF weight bert/encoder/layer_6/attention/self/key/bias/adam_v with shape [768]\n",
      "Loading TF weight bert/encoder/layer_6/attention/self/key/kernel with shape [768, 768]\n",
      "Loading TF weight bert/encoder/layer_6/attention/self/key/kernel/adam_m with shape [768, 768]\n",
      "Loading TF weight bert/encoder/layer_6/attention/self/key/kernel/adam_v with shape [768, 768]\n",
      "Loading TF weight bert/encoder/layer_6/attention/self/query/bias with shape [768]\n",
      "Loading TF weight bert/encoder/layer_6/attention/self/query/bias/adam_m with shape [768]\n",
      "Loading TF weight bert/encoder/layer_6/attention/self/query/bias/adam_v with shape [768]\n",
      "Loading TF weight bert/encoder/layer_6/attention/self/query/kernel with shape [768, 768]\n",
      "Loading TF weight bert/encoder/layer_6/attention/self/query/kernel/adam_m with shape [768, 768]\n",
      "Loading TF weight bert/encoder/layer_6/attention/self/query/kernel/adam_v with shape [768, 768]\n",
      "Loading TF weight bert/encoder/layer_6/attention/self/value/bias with shape [768]\n",
      "Loading TF weight bert/encoder/layer_6/attention/self/value/bias/adam_m with shape [768]\n",
      "Loading TF weight bert/encoder/layer_6/attention/self/value/bias/adam_v with shape [768]\n",
      "Loading TF weight bert/encoder/layer_6/attention/self/value/kernel with shape [768, 768]\n",
      "Loading TF weight bert/encoder/layer_6/attention/self/value/kernel/adam_m with shape [768, 768]\n",
      "Loading TF weight bert/encoder/layer_6/attention/self/value/kernel/adam_v with shape [768, 768]\n",
      "Loading TF weight bert/encoder/layer_6/intermediate/dense/bias with shape [3072]\n",
      "Loading TF weight bert/encoder/layer_6/intermediate/dense/bias/adam_m with shape [3072]\n",
      "Loading TF weight bert/encoder/layer_6/intermediate/dense/bias/adam_v with shape [3072]\n",
      "Loading TF weight bert/encoder/layer_6/intermediate/dense/kernel with shape [768, 3072]\n",
      "Loading TF weight bert/encoder/layer_6/intermediate/dense/kernel/adam_m with shape [768, 3072]\n",
      "Loading TF weight bert/encoder/layer_6/intermediate/dense/kernel/adam_v with shape [768, 3072]\n",
      "Loading TF weight bert/encoder/layer_6/output/LayerNorm/beta with shape [768]\n",
      "Loading TF weight bert/encoder/layer_6/output/LayerNorm/beta/adam_m with shape [768]\n",
      "Loading TF weight bert/encoder/layer_6/output/LayerNorm/beta/adam_v with shape [768]\n",
      "Loading TF weight bert/encoder/layer_6/output/LayerNorm/gamma with shape [768]\n",
      "Loading TF weight bert/encoder/layer_6/output/LayerNorm/gamma/adam_m with shape [768]\n",
      "Loading TF weight bert/encoder/layer_6/output/LayerNorm/gamma/adam_v with shape [768]\n",
      "Loading TF weight bert/encoder/layer_6/output/dense/bias with shape [768]\n",
      "Loading TF weight bert/encoder/layer_6/output/dense/bias/adam_m with shape [768]\n",
      "Loading TF weight bert/encoder/layer_6/output/dense/bias/adam_v with shape [768]\n",
      "Loading TF weight bert/encoder/layer_6/output/dense/kernel with shape [3072, 768]\n",
      "Loading TF weight bert/encoder/layer_6/output/dense/kernel/adam_m with shape [3072, 768]\n",
      "Loading TF weight bert/encoder/layer_6/output/dense/kernel/adam_v with shape [3072, 768]\n",
      "Loading TF weight bert/encoder/layer_7/attention/output/LayerNorm/beta with shape [768]\n",
      "Loading TF weight bert/encoder/layer_7/attention/output/LayerNorm/beta/adam_m with shape [768]\n",
      "Loading TF weight bert/encoder/layer_7/attention/output/LayerNorm/beta/adam_v with shape [768]\n",
      "Loading TF weight bert/encoder/layer_7/attention/output/LayerNorm/gamma with shape [768]\n",
      "Loading TF weight bert/encoder/layer_7/attention/output/LayerNorm/gamma/adam_m with shape [768]\n",
      "Loading TF weight bert/encoder/layer_7/attention/output/LayerNorm/gamma/adam_v with shape [768]\n",
      "Loading TF weight bert/encoder/layer_7/attention/output/dense/bias with shape [768]\n",
      "Loading TF weight bert/encoder/layer_7/attention/output/dense/bias/adam_m with shape [768]\n",
      "Loading TF weight bert/encoder/layer_7/attention/output/dense/bias/adam_v with shape [768]\n",
      "Loading TF weight bert/encoder/layer_7/attention/output/dense/kernel with shape [768, 768]\n",
      "Loading TF weight bert/encoder/layer_7/attention/output/dense/kernel/adam_m with shape [768, 768]\n",
      "Loading TF weight bert/encoder/layer_7/attention/output/dense/kernel/adam_v with shape [768, 768]\n",
      "Loading TF weight bert/encoder/layer_7/attention/self/key/bias with shape [768]\n",
      "Loading TF weight bert/encoder/layer_7/attention/self/key/bias/adam_m with shape [768]\n",
      "Loading TF weight bert/encoder/layer_7/attention/self/key/bias/adam_v with shape [768]\n",
      "Loading TF weight bert/encoder/layer_7/attention/self/key/kernel with shape [768, 768]\n",
      "Loading TF weight bert/encoder/layer_7/attention/self/key/kernel/adam_m with shape [768, 768]\n",
      "Loading TF weight bert/encoder/layer_7/attention/self/key/kernel/adam_v with shape [768, 768]\n",
      "Loading TF weight bert/encoder/layer_7/attention/self/query/bias with shape [768]\n",
      "Loading TF weight bert/encoder/layer_7/attention/self/query/bias/adam_m with shape [768]\n",
      "Loading TF weight bert/encoder/layer_7/attention/self/query/bias/adam_v with shape [768]\n",
      "Loading TF weight bert/encoder/layer_7/attention/self/query/kernel with shape [768, 768]\n",
      "Loading TF weight bert/encoder/layer_7/attention/self/query/kernel/adam_m with shape [768, 768]\n",
      "Loading TF weight bert/encoder/layer_7/attention/self/query/kernel/adam_v with shape [768, 768]\n",
      "Loading TF weight bert/encoder/layer_7/attention/self/value/bias with shape [768]\n",
      "Loading TF weight bert/encoder/layer_7/attention/self/value/bias/adam_m with shape [768]\n",
      "Loading TF weight bert/encoder/layer_7/attention/self/value/bias/adam_v with shape [768]\n",
      "Loading TF weight bert/encoder/layer_7/attention/self/value/kernel with shape [768, 768]\n",
      "Loading TF weight bert/encoder/layer_7/attention/self/value/kernel/adam_m with shape [768, 768]\n",
      "Loading TF weight bert/encoder/layer_7/attention/self/value/kernel/adam_v with shape [768, 768]\n",
      "Loading TF weight bert/encoder/layer_7/intermediate/dense/bias with shape [3072]\n",
      "Loading TF weight bert/encoder/layer_7/intermediate/dense/bias/adam_m with shape [3072]\n",
      "Loading TF weight bert/encoder/layer_7/intermediate/dense/bias/adam_v with shape [3072]\n",
      "Loading TF weight bert/encoder/layer_7/intermediate/dense/kernel with shape [768, 3072]\n",
      "Loading TF weight bert/encoder/layer_7/intermediate/dense/kernel/adam_m with shape [768, 3072]\n",
      "Loading TF weight bert/encoder/layer_7/intermediate/dense/kernel/adam_v with shape [768, 3072]\n",
      "Loading TF weight bert/encoder/layer_7/output/LayerNorm/beta with shape [768]\n",
      "Loading TF weight bert/encoder/layer_7/output/LayerNorm/beta/adam_m with shape [768]\n",
      "Loading TF weight bert/encoder/layer_7/output/LayerNorm/beta/adam_v with shape [768]\n",
      "Loading TF weight bert/encoder/layer_7/output/LayerNorm/gamma with shape [768]\n",
      "Loading TF weight bert/encoder/layer_7/output/LayerNorm/gamma/adam_m with shape [768]\n",
      "Loading TF weight bert/encoder/layer_7/output/LayerNorm/gamma/adam_v with shape [768]\n",
      "Loading TF weight bert/encoder/layer_7/output/dense/bias with shape [768]\n",
      "Loading TF weight bert/encoder/layer_7/output/dense/bias/adam_m with shape [768]\n",
      "Loading TF weight bert/encoder/layer_7/output/dense/bias/adam_v with shape [768]\n",
      "Loading TF weight bert/encoder/layer_7/output/dense/kernel with shape [3072, 768]\n",
      "Loading TF weight bert/encoder/layer_7/output/dense/kernel/adam_m with shape [3072, 768]\n",
      "Loading TF weight bert/encoder/layer_7/output/dense/kernel/adam_v with shape [3072, 768]\n",
      "Loading TF weight bert/encoder/layer_8/attention/output/LayerNorm/beta with shape [768]\n",
      "Loading TF weight bert/encoder/layer_8/attention/output/LayerNorm/beta/adam_m with shape [768]\n",
      "Loading TF weight bert/encoder/layer_8/attention/output/LayerNorm/beta/adam_v with shape [768]\n",
      "Loading TF weight bert/encoder/layer_8/attention/output/LayerNorm/gamma with shape [768]\n",
      "Loading TF weight bert/encoder/layer_8/attention/output/LayerNorm/gamma/adam_m with shape [768]\n",
      "Loading TF weight bert/encoder/layer_8/attention/output/LayerNorm/gamma/adam_v with shape [768]\n",
      "Loading TF weight bert/encoder/layer_8/attention/output/dense/bias with shape [768]\n",
      "Loading TF weight bert/encoder/layer_8/attention/output/dense/bias/adam_m with shape [768]\n",
      "Loading TF weight bert/encoder/layer_8/attention/output/dense/bias/adam_v with shape [768]\n",
      "Loading TF weight bert/encoder/layer_8/attention/output/dense/kernel with shape [768, 768]\n",
      "Loading TF weight bert/encoder/layer_8/attention/output/dense/kernel/adam_m with shape [768, 768]\n",
      "Loading TF weight bert/encoder/layer_8/attention/output/dense/kernel/adam_v with shape [768, 768]\n",
      "Loading TF weight bert/encoder/layer_8/attention/self/key/bias with shape [768]\n",
      "Loading TF weight bert/encoder/layer_8/attention/self/key/bias/adam_m with shape [768]\n",
      "Loading TF weight bert/encoder/layer_8/attention/self/key/bias/adam_v with shape [768]\n",
      "Loading TF weight bert/encoder/layer_8/attention/self/key/kernel with shape [768, 768]\n",
      "Loading TF weight bert/encoder/layer_8/attention/self/key/kernel/adam_m with shape [768, 768]\n",
      "Loading TF weight bert/encoder/layer_8/attention/self/key/kernel/adam_v with shape [768, 768]\n",
      "Loading TF weight bert/encoder/layer_8/attention/self/query/bias with shape [768]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Loading TF weight bert/encoder/layer_8/attention/self/query/bias/adam_m with shape [768]\n",
      "Loading TF weight bert/encoder/layer_8/attention/self/query/bias/adam_v with shape [768]\n",
      "Loading TF weight bert/encoder/layer_8/attention/self/query/kernel with shape [768, 768]\n",
      "Loading TF weight bert/encoder/layer_8/attention/self/query/kernel/adam_m with shape [768, 768]\n",
      "Loading TF weight bert/encoder/layer_8/attention/self/query/kernel/adam_v with shape [768, 768]\n",
      "Loading TF weight bert/encoder/layer_8/attention/self/value/bias with shape [768]\n",
      "Loading TF weight bert/encoder/layer_8/attention/self/value/bias/adam_m with shape [768]\n",
      "Loading TF weight bert/encoder/layer_8/attention/self/value/bias/adam_v with shape [768]\n",
      "Loading TF weight bert/encoder/layer_8/attention/self/value/kernel with shape [768, 768]\n",
      "Loading TF weight bert/encoder/layer_8/attention/self/value/kernel/adam_m with shape [768, 768]\n",
      "Loading TF weight bert/encoder/layer_8/attention/self/value/kernel/adam_v with shape [768, 768]\n",
      "Loading TF weight bert/encoder/layer_8/intermediate/dense/bias with shape [3072]\n",
      "Loading TF weight bert/encoder/layer_8/intermediate/dense/bias/adam_m with shape [3072]\n",
      "Loading TF weight bert/encoder/layer_8/intermediate/dense/bias/adam_v with shape [3072]\n",
      "Loading TF weight bert/encoder/layer_8/intermediate/dense/kernel with shape [768, 3072]\n",
      "Loading TF weight bert/encoder/layer_8/intermediate/dense/kernel/adam_m with shape [768, 3072]\n",
      "Loading TF weight bert/encoder/layer_8/intermediate/dense/kernel/adam_v with shape [768, 3072]\n",
      "Loading TF weight bert/encoder/layer_8/output/LayerNorm/beta with shape [768]\n",
      "Loading TF weight bert/encoder/layer_8/output/LayerNorm/beta/adam_m with shape [768]\n",
      "Loading TF weight bert/encoder/layer_8/output/LayerNorm/beta/adam_v with shape [768]\n",
      "Loading TF weight bert/encoder/layer_8/output/LayerNorm/gamma with shape [768]\n",
      "Loading TF weight bert/encoder/layer_8/output/LayerNorm/gamma/adam_m with shape [768]\n",
      "Loading TF weight bert/encoder/layer_8/output/LayerNorm/gamma/adam_v with shape [768]\n",
      "Loading TF weight bert/encoder/layer_8/output/dense/bias with shape [768]\n",
      "Loading TF weight bert/encoder/layer_8/output/dense/bias/adam_m with shape [768]\n",
      "Loading TF weight bert/encoder/layer_8/output/dense/bias/adam_v with shape [768]\n",
      "Loading TF weight bert/encoder/layer_8/output/dense/kernel with shape [3072, 768]\n",
      "Loading TF weight bert/encoder/layer_8/output/dense/kernel/adam_m with shape [3072, 768]\n",
      "Loading TF weight bert/encoder/layer_8/output/dense/kernel/adam_v with shape [3072, 768]\n",
      "Loading TF weight bert/encoder/layer_9/attention/output/LayerNorm/beta with shape [768]\n",
      "Loading TF weight bert/encoder/layer_9/attention/output/LayerNorm/beta/adam_m with shape [768]\n",
      "Loading TF weight bert/encoder/layer_9/attention/output/LayerNorm/beta/adam_v with shape [768]\n",
      "Loading TF weight bert/encoder/layer_9/attention/output/LayerNorm/gamma with shape [768]\n",
      "Loading TF weight bert/encoder/layer_9/attention/output/LayerNorm/gamma/adam_m with shape [768]\n",
      "Loading TF weight bert/encoder/layer_9/attention/output/LayerNorm/gamma/adam_v with shape [768]\n",
      "Loading TF weight bert/encoder/layer_9/attention/output/dense/bias with shape [768]\n",
      "Loading TF weight bert/encoder/layer_9/attention/output/dense/bias/adam_m with shape [768]\n",
      "Loading TF weight bert/encoder/layer_9/attention/output/dense/bias/adam_v with shape [768]\n",
      "Loading TF weight bert/encoder/layer_9/attention/output/dense/kernel with shape [768, 768]\n",
      "Loading TF weight bert/encoder/layer_9/attention/output/dense/kernel/adam_m with shape [768, 768]\n",
      "Loading TF weight bert/encoder/layer_9/attention/output/dense/kernel/adam_v with shape [768, 768]\n",
      "Loading TF weight bert/encoder/layer_9/attention/self/key/bias with shape [768]\n",
      "Loading TF weight bert/encoder/layer_9/attention/self/key/bias/adam_m with shape [768]\n",
      "Loading TF weight bert/encoder/layer_9/attention/self/key/bias/adam_v with shape [768]\n",
      "Loading TF weight bert/encoder/layer_9/attention/self/key/kernel with shape [768, 768]\n",
      "Loading TF weight bert/encoder/layer_9/attention/self/key/kernel/adam_m with shape [768, 768]\n",
      "Loading TF weight bert/encoder/layer_9/attention/self/key/kernel/adam_v with shape [768, 768]\n",
      "Loading TF weight bert/encoder/layer_9/attention/self/query/bias with shape [768]\n",
      "Loading TF weight bert/encoder/layer_9/attention/self/query/bias/adam_m with shape [768]\n",
      "Loading TF weight bert/encoder/layer_9/attention/self/query/bias/adam_v with shape [768]\n",
      "Loading TF weight bert/encoder/layer_9/attention/self/query/kernel with shape [768, 768]\n",
      "Loading TF weight bert/encoder/layer_9/attention/self/query/kernel/adam_m with shape [768, 768]\n",
      "Loading TF weight bert/encoder/layer_9/attention/self/query/kernel/adam_v with shape [768, 768]\n",
      "Loading TF weight bert/encoder/layer_9/attention/self/value/bias with shape [768]\n",
      "Loading TF weight bert/encoder/layer_9/attention/self/value/bias/adam_m with shape [768]\n",
      "Loading TF weight bert/encoder/layer_9/attention/self/value/bias/adam_v with shape [768]\n",
      "Loading TF weight bert/encoder/layer_9/attention/self/value/kernel with shape [768, 768]\n",
      "Loading TF weight bert/encoder/layer_9/attention/self/value/kernel/adam_m with shape [768, 768]\n",
      "Loading TF weight bert/encoder/layer_9/attention/self/value/kernel/adam_v with shape [768, 768]\n",
      "Loading TF weight bert/encoder/layer_9/intermediate/dense/bias with shape [3072]\n",
      "Loading TF weight bert/encoder/layer_9/intermediate/dense/bias/adam_m with shape [3072]\n",
      "Loading TF weight bert/encoder/layer_9/intermediate/dense/bias/adam_v with shape [3072]\n",
      "Loading TF weight bert/encoder/layer_9/intermediate/dense/kernel with shape [768, 3072]\n",
      "Loading TF weight bert/encoder/layer_9/intermediate/dense/kernel/adam_m with shape [768, 3072]\n",
      "Loading TF weight bert/encoder/layer_9/intermediate/dense/kernel/adam_v with shape [768, 3072]\n",
      "Loading TF weight bert/encoder/layer_9/output/LayerNorm/beta with shape [768]\n",
      "Loading TF weight bert/encoder/layer_9/output/LayerNorm/beta/adam_m with shape [768]\n",
      "Loading TF weight bert/encoder/layer_9/output/LayerNorm/beta/adam_v with shape [768]\n",
      "Loading TF weight bert/encoder/layer_9/output/LayerNorm/gamma with shape [768]\n",
      "Loading TF weight bert/encoder/layer_9/output/LayerNorm/gamma/adam_m with shape [768]\n",
      "Loading TF weight bert/encoder/layer_9/output/LayerNorm/gamma/adam_v with shape [768]\n",
      "Loading TF weight bert/encoder/layer_9/output/dense/bias with shape [768]\n",
      "Loading TF weight bert/encoder/layer_9/output/dense/bias/adam_m with shape [768]\n",
      "Loading TF weight bert/encoder/layer_9/output/dense/bias/adam_v with shape [768]\n",
      "Loading TF weight bert/encoder/layer_9/output/dense/kernel with shape [3072, 768]\n",
      "Loading TF weight bert/encoder/layer_9/output/dense/kernel/adam_m with shape [3072, 768]\n",
      "Loading TF weight bert/encoder/layer_9/output/dense/kernel/adam_v with shape [3072, 768]\n",
      "Loading TF weight bert/pooler/dense/bias with shape [768]\n",
      "Loading TF weight bert/pooler/dense/bias/adam_m with shape [768]\n",
      "Loading TF weight bert/pooler/dense/bias/adam_v with shape [768]\n",
      "Loading TF weight bert/pooler/dense/kernel with shape [768, 768]\n",
      "Loading TF weight bert/pooler/dense/kernel/adam_m with shape [768, 768]\n",
      "Loading TF weight bert/pooler/dense/kernel/adam_v with shape [768, 768]\n",
      "Loading TF weight cls/predictions/output_bias with shape [32000]\n",
      "Loading TF weight cls/predictions/output_bias/adam_m with shape [32000]\n",
      "Loading TF weight cls/predictions/output_bias/adam_v with shape [32000]\n",
      "Loading TF weight cls/predictions/transform/LayerNorm/beta with shape [768]\n",
      "Loading TF weight cls/predictions/transform/LayerNorm/beta/adam_m with shape [768]\n",
      "Loading TF weight cls/predictions/transform/LayerNorm/beta/adam_v with shape [768]\n",
      "Loading TF weight cls/predictions/transform/LayerNorm/gamma with shape [768]\n",
      "Loading TF weight cls/predictions/transform/LayerNorm/gamma/adam_m with shape [768]\n",
      "Loading TF weight cls/predictions/transform/LayerNorm/gamma/adam_v with shape [768]\n",
      "Loading TF weight cls/predictions/transform/dense/bias with shape [768]\n",
      "Loading TF weight cls/predictions/transform/dense/bias/adam_m with shape [768]\n",
      "Loading TF weight cls/predictions/transform/dense/bias/adam_v with shape [768]\n",
      "Loading TF weight cls/predictions/transform/dense/kernel with shape [768, 768]\n",
      "Loading TF weight cls/predictions/transform/dense/kernel/adam_m with shape [768, 768]\n",
      "Loading TF weight cls/predictions/transform/dense/kernel/adam_v with shape [768, 768]\n",
      "Loading TF weight cls/seq_relationship/output_bias with shape [2]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Loading TF weight cls/seq_relationship/output_bias/adam_m with shape [2]\r\n",
      "Loading TF weight cls/seq_relationship/output_bias/adam_v with shape [2]\r\n",
      "Loading TF weight cls/seq_relationship/output_weights with shape [2, 768]\r\n",
      "Loading TF weight cls/seq_relationship/output_weights/adam_m with shape [2, 768]\r\n",
      "Loading TF weight cls/seq_relationship/output_weights/adam_v with shape [2, 768]\r\n",
      "Loading TF weight global_step with shape []\r\n",
      "Initialize PyTorch weight ['bert', 'embeddings', 'LayerNorm', 'beta']\r\n",
      "Skipping bert/embeddings/LayerNorm/beta/adam_m\r\n",
      "Skipping bert/embeddings/LayerNorm/beta/adam_v\r\n",
      "Initialize PyTorch weight ['bert', 'embeddings', 'LayerNorm', 'gamma']\r\n",
      "Skipping bert/embeddings/LayerNorm/gamma/adam_m\r\n",
      "Skipping bert/embeddings/LayerNorm/gamma/adam_v\r\n",
      "Initialize PyTorch weight ['bert', 'embeddings', 'position_embeddings']\r\n",
      "Skipping bert/embeddings/position_embeddings/adam_m\r\n",
      "Skipping bert/embeddings/position_embeddings/adam_v\r\n",
      "Initialize PyTorch weight ['bert', 'embeddings', 'token_type_embeddings']\r\n",
      "Skipping bert/embeddings/token_type_embeddings/adam_m\r\n",
      "Skipping bert/embeddings/token_type_embeddings/adam_v\r\n",
      "Initialize PyTorch weight ['bert', 'embeddings', 'word_embeddings']\r\n",
      "Skipping bert/embeddings/word_embeddings/adam_m\r\n",
      "Skipping bert/embeddings/word_embeddings/adam_v\r\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_0', 'attention', 'output', 'LayerNorm', 'beta']\r\n",
      "Skipping bert/encoder/layer_0/attention/output/LayerNorm/beta/adam_m\r\n",
      "Skipping bert/encoder/layer_0/attention/output/LayerNorm/beta/adam_v\r\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_0', 'attention', 'output', 'LayerNorm', 'gamma']\r\n",
      "Skipping bert/encoder/layer_0/attention/output/LayerNorm/gamma/adam_m\r\n",
      "Skipping bert/encoder/layer_0/attention/output/LayerNorm/gamma/adam_v\r\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_0', 'attention', 'output', 'dense', 'bias']\r\n",
      "Skipping bert/encoder/layer_0/attention/output/dense/bias/adam_m\r\n",
      "Skipping bert/encoder/layer_0/attention/output/dense/bias/adam_v\r\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_0', 'attention', 'output', 'dense', 'kernel']\r\n",
      "Skipping bert/encoder/layer_0/attention/output/dense/kernel/adam_m\r\n",
      "Skipping bert/encoder/layer_0/attention/output/dense/kernel/adam_v\r\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_0', 'attention', 'self', 'key', 'bias']\r\n",
      "Skipping bert/encoder/layer_0/attention/self/key/bias/adam_m\r\n",
      "Skipping bert/encoder/layer_0/attention/self/key/bias/adam_v\r\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_0', 'attention', 'self', 'key', 'kernel']\r\n",
      "Skipping bert/encoder/layer_0/attention/self/key/kernel/adam_m\r\n",
      "Skipping bert/encoder/layer_0/attention/self/key/kernel/adam_v\r\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_0', 'attention', 'self', 'query', 'bias']\r\n",
      "Skipping bert/encoder/layer_0/attention/self/query/bias/adam_m\r\n",
      "Skipping bert/encoder/layer_0/attention/self/query/bias/adam_v\r\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_0', 'attention', 'self', 'query', 'kernel']\r\n",
      "Skipping bert/encoder/layer_0/attention/self/query/kernel/adam_m\r\n",
      "Skipping bert/encoder/layer_0/attention/self/query/kernel/adam_v\r\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_0', 'attention', 'self', 'value', 'bias']\r\n",
      "Skipping bert/encoder/layer_0/attention/self/value/bias/adam_m\r\n",
      "Skipping bert/encoder/layer_0/attention/self/value/bias/adam_v\r\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_0', 'attention', 'self', 'value', 'kernel']\r\n",
      "Skipping bert/encoder/layer_0/attention/self/value/kernel/adam_m\r\n",
      "Skipping bert/encoder/layer_0/attention/self/value/kernel/adam_v\r\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_0', 'intermediate', 'dense', 'bias']\r\n",
      "Skipping bert/encoder/layer_0/intermediate/dense/bias/adam_m\r\n",
      "Skipping bert/encoder/layer_0/intermediate/dense/bias/adam_v\r\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_0', 'intermediate', 'dense', 'kernel']\r\n",
      "Skipping bert/encoder/layer_0/intermediate/dense/kernel/adam_m\r\n",
      "Skipping bert/encoder/layer_0/intermediate/dense/kernel/adam_v\r\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_0', 'output', 'LayerNorm', 'beta']\r\n",
      "Skipping bert/encoder/layer_0/output/LayerNorm/beta/adam_m\r\n",
      "Skipping bert/encoder/layer_0/output/LayerNorm/beta/adam_v\r\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_0', 'output', 'LayerNorm', 'gamma']\r\n",
      "Skipping bert/encoder/layer_0/output/LayerNorm/gamma/adam_m\r\n",
      "Skipping bert/encoder/layer_0/output/LayerNorm/gamma/adam_v\r\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_0', 'output', 'dense', 'bias']\r\n",
      "Skipping bert/encoder/layer_0/output/dense/bias/adam_m\r\n",
      "Skipping bert/encoder/layer_0/output/dense/bias/adam_v\r\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_0', 'output', 'dense', 'kernel']\r\n",
      "Skipping bert/encoder/layer_0/output/dense/kernel/adam_m\r\n",
      "Skipping bert/encoder/layer_0/output/dense/kernel/adam_v\r\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_1', 'attention', 'output', 'LayerNorm', 'beta']\r\n",
      "Skipping bert/encoder/layer_1/attention/output/LayerNorm/beta/adam_m\r\n",
      "Skipping bert/encoder/layer_1/attention/output/LayerNorm/beta/adam_v\r\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_1', 'attention', 'output', 'LayerNorm', 'gamma']\r\n",
      "Skipping bert/encoder/layer_1/attention/output/LayerNorm/gamma/adam_m\r\n",
      "Skipping bert/encoder/layer_1/attention/output/LayerNorm/gamma/adam_v\r\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_1', 'attention', 'output', 'dense', 'bias']\r\n",
      "Skipping bert/encoder/layer_1/attention/output/dense/bias/adam_m\r\n",
      "Skipping bert/encoder/layer_1/attention/output/dense/bias/adam_v\r\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_1', 'attention', 'output', 'dense', 'kernel']\r\n",
      "Skipping bert/encoder/layer_1/attention/output/dense/kernel/adam_m\r\n",
      "Skipping bert/encoder/layer_1/attention/output/dense/kernel/adam_v\r\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_1', 'attention', 'self', 'key', 'bias']\r\n",
      "Skipping bert/encoder/layer_1/attention/self/key/bias/adam_m\r\n",
      "Skipping bert/encoder/layer_1/attention/self/key/bias/adam_v\r\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_1', 'attention', 'self', 'key', 'kernel']\r\n",
      "Skipping bert/encoder/layer_1/attention/self/key/kernel/adam_m\r\n",
      "Skipping bert/encoder/layer_1/attention/self/key/kernel/adam_v\r\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_1', 'attention', 'self', 'query', 'bias']\r\n",
      "Skipping bert/encoder/layer_1/attention/self/query/bias/adam_m\r\n",
      "Skipping bert/encoder/layer_1/attention/self/query/bias/adam_v\r\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_1', 'attention', 'self', 'query', 'kernel']\r\n",
      "Skipping bert/encoder/layer_1/attention/self/query/kernel/adam_m\r\n",
      "Skipping bert/encoder/layer_1/attention/self/query/kernel/adam_v\r\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_1', 'attention', 'self', 'value', 'bias']\r\n",
      "Skipping bert/encoder/layer_1/attention/self/value/bias/adam_m\r\n",
      "Skipping bert/encoder/layer_1/attention/self/value/bias/adam_v\r\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_1', 'attention', 'self', 'value', 'kernel']\r\n",
      "Skipping bert/encoder/layer_1/attention/self/value/kernel/adam_m\r\n",
      "Skipping bert/encoder/layer_1/attention/self/value/kernel/adam_v\r\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_1', 'intermediate', 'dense', 'bias']\r\n",
      "Skipping bert/encoder/layer_1/intermediate/dense/bias/adam_m\r\n",
      "Skipping bert/encoder/layer_1/intermediate/dense/bias/adam_v\r\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_1', 'intermediate', 'dense', 'kernel']\r\n",
      "Skipping bert/encoder/layer_1/intermediate/dense/kernel/adam_m\r\n",
      "Skipping bert/encoder/layer_1/intermediate/dense/kernel/adam_v\r\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_1', 'output', 'LayerNorm', 'beta']\r\n",
      "Skipping bert/encoder/layer_1/output/LayerNorm/beta/adam_m\r\n",
      "Skipping bert/encoder/layer_1/output/LayerNorm/beta/adam_v\r\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_1', 'output', 'LayerNorm', 'gamma']\r\n",
      "Skipping bert/encoder/layer_1/output/LayerNorm/gamma/adam_m\r\n",
      "Skipping bert/encoder/layer_1/output/LayerNorm/gamma/adam_v\r\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_1', 'output', 'dense', 'bias']\r\n",
      "Skipping bert/encoder/layer_1/output/dense/bias/adam_m\r\n",
      "Skipping bert/encoder/layer_1/output/dense/bias/adam_v\r\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_1', 'output', 'dense', 'kernel']\r\n",
      "Skipping bert/encoder/layer_1/output/dense/kernel/adam_m\r\n",
      "Skipping bert/encoder/layer_1/output/dense/kernel/adam_v\r\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_10', 'attention', 'output', 'LayerNorm', 'beta']\r\n",
      "Skipping bert/encoder/layer_10/attention/output/LayerNorm/beta/adam_m\r\n",
      "Skipping bert/encoder/layer_10/attention/output/LayerNorm/beta/adam_v\r\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_10', 'attention', 'output', 'LayerNorm', 'gamma']\r\n",
      "Skipping bert/encoder/layer_10/attention/output/LayerNorm/gamma/adam_m\r\n",
      "Skipping bert/encoder/layer_10/attention/output/LayerNorm/gamma/adam_v\r\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_10', 'attention', 'output', 'dense', 'bias']\r\n",
      "Skipping bert/encoder/layer_10/attention/output/dense/bias/adam_m\r\n",
      "Skipping bert/encoder/layer_10/attention/output/dense/bias/adam_v\r\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_10', 'attention', 'output', 'dense', 'kernel']\r\n",
      "Skipping bert/encoder/layer_10/attention/output/dense/kernel/adam_m\r\n",
      "Skipping bert/encoder/layer_10/attention/output/dense/kernel/adam_v\r\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_10', 'attention', 'self', 'key', 'bias']\r\n",
      "Skipping bert/encoder/layer_10/attention/self/key/bias/adam_m\r\n",
      "Skipping bert/encoder/layer_10/attention/self/key/bias/adam_v\r\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_10', 'attention', 'self', 'key', 'kernel']\r\n",
      "Skipping bert/encoder/layer_10/attention/self/key/kernel/adam_m\r\n",
      "Skipping bert/encoder/layer_10/attention/self/key/kernel/adam_v\r\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_10', 'attention', 'self', 'query', 'bias']\r\n",
      "Skipping bert/encoder/layer_10/attention/self/query/bias/adam_m\r\n",
      "Skipping bert/encoder/layer_10/attention/self/query/bias/adam_v\r\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_10', 'attention', 'self', 'query', 'kernel']\r\n",
      "Skipping bert/encoder/layer_10/attention/self/query/kernel/adam_m\r\n",
      "Skipping bert/encoder/layer_10/attention/self/query/kernel/adam_v\r\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_10', 'attention', 'self', 'value', 'bias']\r\n",
      "Skipping bert/encoder/layer_10/attention/self/value/bias/adam_m\r\n",
      "Skipping bert/encoder/layer_10/attention/self/value/bias/adam_v\r\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_10', 'attention', 'self', 'value', 'kernel']\r\n",
      "Skipping bert/encoder/layer_10/attention/self/value/kernel/adam_m\r\n",
      "Skipping bert/encoder/layer_10/attention/self/value/kernel/adam_v\r\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_10', 'intermediate', 'dense', 'bias']\r\n",
      "Skipping bert/encoder/layer_10/intermediate/dense/bias/adam_m\r\n",
      "Skipping bert/encoder/layer_10/intermediate/dense/bias/adam_v\r\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_10', 'intermediate', 'dense', 'kernel']\r\n",
      "Skipping bert/encoder/layer_10/intermediate/dense/kernel/adam_m\r\n",
      "Skipping bert/encoder/layer_10/intermediate/dense/kernel/adam_v\r\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_10', 'output', 'LayerNorm', 'beta']\r\n",
      "Skipping bert/encoder/layer_10/output/LayerNorm/beta/adam_m\r\n",
      "Skipping bert/encoder/layer_10/output/LayerNorm/beta/adam_v\r\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_10', 'output', 'LayerNorm', 'gamma']\r\n",
      "Skipping bert/encoder/layer_10/output/LayerNorm/gamma/adam_m\r\n",
      "Skipping bert/encoder/layer_10/output/LayerNorm/gamma/adam_v\r\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_10', 'output', 'dense', 'bias']\r\n",
      "Skipping bert/encoder/layer_10/output/dense/bias/adam_m\r\n",
      "Skipping bert/encoder/layer_10/output/dense/bias/adam_v\r\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_10', 'output', 'dense', 'kernel']\r\n",
      "Skipping bert/encoder/layer_10/output/dense/kernel/adam_m\r\n",
      "Skipping bert/encoder/layer_10/output/dense/kernel/adam_v\r\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_11', 'attention', 'output', 'LayerNorm', 'beta']\r\n",
      "Skipping bert/encoder/layer_11/attention/output/LayerNorm/beta/adam_m\r\n",
      "Skipping bert/encoder/layer_11/attention/output/LayerNorm/beta/adam_v\r\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_11', 'attention', 'output', 'LayerNorm', 'gamma']\r\n",
      "Skipping bert/encoder/layer_11/attention/output/LayerNorm/gamma/adam_m\r\n",
      "Skipping bert/encoder/layer_11/attention/output/LayerNorm/gamma/adam_v\r\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_11', 'attention', 'output', 'dense', 'bias']\r\n",
      "Skipping bert/encoder/layer_11/attention/output/dense/bias/adam_m\r\n",
      "Skipping bert/encoder/layer_11/attention/output/dense/bias/adam_v\r\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_11', 'attention', 'output', 'dense', 'kernel']\r\n",
      "Skipping bert/encoder/layer_11/attention/output/dense/kernel/adam_m\r\n",
      "Skipping bert/encoder/layer_11/attention/output/dense/kernel/adam_v\r\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_11', 'attention', 'self', 'key', 'bias']\r\n",
      "Skipping bert/encoder/layer_11/attention/self/key/bias/adam_m\r\n",
      "Skipping bert/encoder/layer_11/attention/self/key/bias/adam_v\r\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_11', 'attention', 'self', 'key', 'kernel']\r\n",
      "Skipping bert/encoder/layer_11/attention/self/key/kernel/adam_m\r\n",
      "Skipping bert/encoder/layer_11/attention/self/key/kernel/adam_v\r\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_11', 'attention', 'self', 'query', 'bias']\r\n",
      "Skipping bert/encoder/layer_11/attention/self/query/bias/adam_m\r\n",
      "Skipping bert/encoder/layer_11/attention/self/query/bias/adam_v\r\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_11', 'attention', 'self', 'query', 'kernel']\r\n",
      "Skipping bert/encoder/layer_11/attention/self/query/kernel/adam_m\r\n",
      "Skipping bert/encoder/layer_11/attention/self/query/kernel/adam_v\r\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_11', 'attention', 'self', 'value', 'bias']\r\n",
      "Skipping bert/encoder/layer_11/attention/self/value/bias/adam_m\r\n",
      "Skipping bert/encoder/layer_11/attention/self/value/bias/adam_v\r\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_11', 'attention', 'self', 'value', 'kernel']\r\n",
      "Skipping bert/encoder/layer_11/attention/self/value/kernel/adam_m\r\n",
      "Skipping bert/encoder/layer_11/attention/self/value/kernel/adam_v\r\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_11', 'intermediate', 'dense', 'bias']\r\n",
      "Skipping bert/encoder/layer_11/intermediate/dense/bias/adam_m\r\n",
      "Skipping bert/encoder/layer_11/intermediate/dense/bias/adam_v\r\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_11', 'intermediate', 'dense', 'kernel']\r\n",
      "Skipping bert/encoder/layer_11/intermediate/dense/kernel/adam_m\r\n",
      "Skipping bert/encoder/layer_11/intermediate/dense/kernel/adam_v\r\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_11', 'output', 'LayerNorm', 'beta']\r\n",
      "Skipping bert/encoder/layer_11/output/LayerNorm/beta/adam_m\r\n",
      "Skipping bert/encoder/layer_11/output/LayerNorm/beta/adam_v\r\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_11', 'output', 'LayerNorm', 'gamma']\r\n",
      "Skipping bert/encoder/layer_11/output/LayerNorm/gamma/adam_m\r\n",
      "Skipping bert/encoder/layer_11/output/LayerNorm/gamma/adam_v\r\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_11', 'output', 'dense', 'bias']\r\n",
      "Skipping bert/encoder/layer_11/output/dense/bias/adam_m\r\n",
      "Skipping bert/encoder/layer_11/output/dense/bias/adam_v\r\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_11', 'output', 'dense', 'kernel']\r\n",
      "Skipping bert/encoder/layer_11/output/dense/kernel/adam_m\r\n",
      "Skipping bert/encoder/layer_11/output/dense/kernel/adam_v\r\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_2', 'attention', 'output', 'LayerNorm', 'beta']\r\n",
      "Skipping bert/encoder/layer_2/attention/output/LayerNorm/beta/adam_m\r\n",
      "Skipping bert/encoder/layer_2/attention/output/LayerNorm/beta/adam_v\r\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_2', 'attention', 'output', 'LayerNorm', 'gamma']\r\n",
      "Skipping bert/encoder/layer_2/attention/output/LayerNorm/gamma/adam_m\r\n",
      "Skipping bert/encoder/layer_2/attention/output/LayerNorm/gamma/adam_v\r\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_2', 'attention', 'output', 'dense', 'bias']\r\n",
      "Skipping bert/encoder/layer_2/attention/output/dense/bias/adam_m\r\n",
      "Skipping bert/encoder/layer_2/attention/output/dense/bias/adam_v\r\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_2', 'attention', 'output', 'dense', 'kernel']\r\n",
      "Skipping bert/encoder/layer_2/attention/output/dense/kernel/adam_m\r\n",
      "Skipping bert/encoder/layer_2/attention/output/dense/kernel/adam_v\r\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_2', 'attention', 'self', 'key', 'bias']\r\n",
      "Skipping bert/encoder/layer_2/attention/self/key/bias/adam_m\r\n",
      "Skipping bert/encoder/layer_2/attention/self/key/bias/adam_v\r\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_2', 'attention', 'self', 'key', 'kernel']\r\n",
      "Skipping bert/encoder/layer_2/attention/self/key/kernel/adam_m\r\n",
      "Skipping bert/encoder/layer_2/attention/self/key/kernel/adam_v\r\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_2', 'attention', 'self', 'query', 'bias']\r\n",
      "Skipping bert/encoder/layer_2/attention/self/query/bias/adam_m\r\n",
      "Skipping bert/encoder/layer_2/attention/self/query/bias/adam_v\r\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_2', 'attention', 'self', 'query', 'kernel']\r\n",
      "Skipping bert/encoder/layer_2/attention/self/query/kernel/adam_m\r\n",
      "Skipping bert/encoder/layer_2/attention/self/query/kernel/adam_v\r\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_2', 'attention', 'self', 'value', 'bias']\r\n",
      "Skipping bert/encoder/layer_2/attention/self/value/bias/adam_m\r\n",
      "Skipping bert/encoder/layer_2/attention/self/value/bias/adam_v\r\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_2', 'attention', 'self', 'value', 'kernel']\r\n",
      "Skipping bert/encoder/layer_2/attention/self/value/kernel/adam_m\r\n",
      "Skipping bert/encoder/layer_2/attention/self/value/kernel/adam_v\r\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_2', 'intermediate', 'dense', 'bias']\r\n",
      "Skipping bert/encoder/layer_2/intermediate/dense/bias/adam_m\r\n",
      "Skipping bert/encoder/layer_2/intermediate/dense/bias/adam_v\r\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_2', 'intermediate', 'dense', 'kernel']\r\n",
      "Skipping bert/encoder/layer_2/intermediate/dense/kernel/adam_m\r\n",
      "Skipping bert/encoder/layer_2/intermediate/dense/kernel/adam_v\r\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_2', 'output', 'LayerNorm', 'beta']\r\n",
      "Skipping bert/encoder/layer_2/output/LayerNorm/beta/adam_m\r\n",
      "Skipping bert/encoder/layer_2/output/LayerNorm/beta/adam_v\r\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_2', 'output', 'LayerNorm', 'gamma']\r\n",
      "Skipping bert/encoder/layer_2/output/LayerNorm/gamma/adam_m\r\n",
      "Skipping bert/encoder/layer_2/output/LayerNorm/gamma/adam_v\r\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_2', 'output', 'dense', 'bias']\r\n",
      "Skipping bert/encoder/layer_2/output/dense/bias/adam_m\r\n",
      "Skipping bert/encoder/layer_2/output/dense/bias/adam_v\r\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_2', 'output', 'dense', 'kernel']\r\n",
      "Skipping bert/encoder/layer_2/output/dense/kernel/adam_m\r\n",
      "Skipping bert/encoder/layer_2/output/dense/kernel/adam_v\r\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_3', 'attention', 'output', 'LayerNorm', 'beta']\r\n",
      "Skipping bert/encoder/layer_3/attention/output/LayerNorm/beta/adam_m\r\n",
      "Skipping bert/encoder/layer_3/attention/output/LayerNorm/beta/adam_v\r\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_3', 'attention', 'output', 'LayerNorm', 'gamma']\r\n",
      "Skipping bert/encoder/layer_3/attention/output/LayerNorm/gamma/adam_m\r\n",
      "Skipping bert/encoder/layer_3/attention/output/LayerNorm/gamma/adam_v\r\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_3', 'attention', 'output', 'dense', 'bias']\r\n",
      "Skipping bert/encoder/layer_3/attention/output/dense/bias/adam_m\r\n",
      "Skipping bert/encoder/layer_3/attention/output/dense/bias/adam_v\r\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_3', 'attention', 'output', 'dense', 'kernel']\r\n",
      "Skipping bert/encoder/layer_3/attention/output/dense/kernel/adam_m\r\n",
      "Skipping bert/encoder/layer_3/attention/output/dense/kernel/adam_v\r\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_3', 'attention', 'self', 'key', 'bias']\r\n",
      "Skipping bert/encoder/layer_3/attention/self/key/bias/adam_m\r\n",
      "Skipping bert/encoder/layer_3/attention/self/key/bias/adam_v\r\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_3', 'attention', 'self', 'key', 'kernel']\r\n",
      "Skipping bert/encoder/layer_3/attention/self/key/kernel/adam_m\r\n",
      "Skipping bert/encoder/layer_3/attention/self/key/kernel/adam_v\r\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_3', 'attention', 'self', 'query', 'bias']\r\n",
      "Skipping bert/encoder/layer_3/attention/self/query/bias/adam_m\r\n",
      "Skipping bert/encoder/layer_3/attention/self/query/bias/adam_v\r\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_3', 'attention', 'self', 'query', 'kernel']\r\n",
      "Skipping bert/encoder/layer_3/attention/self/query/kernel/adam_m\r\n",
      "Skipping bert/encoder/layer_3/attention/self/query/kernel/adam_v\r\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_3', 'attention', 'self', 'value', 'bias']\r\n",
      "Skipping bert/encoder/layer_3/attention/self/value/bias/adam_m\r\n",
      "Skipping bert/encoder/layer_3/attention/self/value/bias/adam_v\r\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_3', 'attention', 'self', 'value', 'kernel']\r\n",
      "Skipping bert/encoder/layer_3/attention/self/value/kernel/adam_m\r\n",
      "Skipping bert/encoder/layer_3/attention/self/value/kernel/adam_v\r\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_3', 'intermediate', 'dense', 'bias']\r\n",
      "Skipping bert/encoder/layer_3/intermediate/dense/bias/adam_m\r\n",
      "Skipping bert/encoder/layer_3/intermediate/dense/bias/adam_v\r\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_3', 'intermediate', 'dense', 'kernel']\r\n",
      "Skipping bert/encoder/layer_3/intermediate/dense/kernel/adam_m\r\n",
      "Skipping bert/encoder/layer_3/intermediate/dense/kernel/adam_v\r\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_3', 'output', 'LayerNorm', 'beta']\r\n",
      "Skipping bert/encoder/layer_3/output/LayerNorm/beta/adam_m\r\n",
      "Skipping bert/encoder/layer_3/output/LayerNorm/beta/adam_v\r\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_3', 'output', 'LayerNorm', 'gamma']\r\n",
      "Skipping bert/encoder/layer_3/output/LayerNorm/gamma/adam_m\r\n",
      "Skipping bert/encoder/layer_3/output/LayerNorm/gamma/adam_v\r\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_3', 'output', 'dense', 'bias']\r\n",
      "Skipping bert/encoder/layer_3/output/dense/bias/adam_m\r\n",
      "Skipping bert/encoder/layer_3/output/dense/bias/adam_v\r\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_3', 'output', 'dense', 'kernel']\r\n",
      "Skipping bert/encoder/layer_3/output/dense/kernel/adam_m\r\n",
      "Skipping bert/encoder/layer_3/output/dense/kernel/adam_v\r\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_4', 'attention', 'output', 'LayerNorm', 'beta']\r\n",
      "Skipping bert/encoder/layer_4/attention/output/LayerNorm/beta/adam_m\r\n",
      "Skipping bert/encoder/layer_4/attention/output/LayerNorm/beta/adam_v\r\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_4', 'attention', 'output', 'LayerNorm', 'gamma']\r\n",
      "Skipping bert/encoder/layer_4/attention/output/LayerNorm/gamma/adam_m\r\n",
      "Skipping bert/encoder/layer_4/attention/output/LayerNorm/gamma/adam_v\r\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_4', 'attention', 'output', 'dense', 'bias']\r\n",
      "Skipping bert/encoder/layer_4/attention/output/dense/bias/adam_m\r\n",
      "Skipping bert/encoder/layer_4/attention/output/dense/bias/adam_v\r\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_4', 'attention', 'output', 'dense', 'kernel']\r\n",
      "Skipping bert/encoder/layer_4/attention/output/dense/kernel/adam_m\r\n",
      "Skipping bert/encoder/layer_4/attention/output/dense/kernel/adam_v\r\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_4', 'attention', 'self', 'key', 'bias']\r\n",
      "Skipping bert/encoder/layer_4/attention/self/key/bias/adam_m\r\n",
      "Skipping bert/encoder/layer_4/attention/self/key/bias/adam_v\r\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_4', 'attention', 'self', 'key', 'kernel']\r\n",
      "Skipping bert/encoder/layer_4/attention/self/key/kernel/adam_m\r\n",
      "Skipping bert/encoder/layer_4/attention/self/key/kernel/adam_v\r\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_4', 'attention', 'self', 'query', 'bias']\r\n",
      "Skipping bert/encoder/layer_4/attention/self/query/bias/adam_m\r\n",
      "Skipping bert/encoder/layer_4/attention/self/query/bias/adam_v\r\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_4', 'attention', 'self', 'query', 'kernel']\r\n",
      "Skipping bert/encoder/layer_4/attention/self/query/kernel/adam_m\r\n",
      "Skipping bert/encoder/layer_4/attention/self/query/kernel/adam_v\r\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_4', 'attention', 'self', 'value', 'bias']\r\n",
      "Skipping bert/encoder/layer_4/attention/self/value/bias/adam_m\r\n",
      "Skipping bert/encoder/layer_4/attention/self/value/bias/adam_v\r\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_4', 'attention', 'self', 'value', 'kernel']\r\n",
      "Skipping bert/encoder/layer_4/attention/self/value/kernel/adam_m\r\n",
      "Skipping bert/encoder/layer_4/attention/self/value/kernel/adam_v\r\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_4', 'intermediate', 'dense', 'bias']\r\n",
      "Skipping bert/encoder/layer_4/intermediate/dense/bias/adam_m\r\n",
      "Skipping bert/encoder/layer_4/intermediate/dense/bias/adam_v\r\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_4', 'intermediate', 'dense', 'kernel']\r\n",
      "Skipping bert/encoder/layer_4/intermediate/dense/kernel/adam_m\r\n",
      "Skipping bert/encoder/layer_4/intermediate/dense/kernel/adam_v\r\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_4', 'output', 'LayerNorm', 'beta']\r\n",
      "Skipping bert/encoder/layer_4/output/LayerNorm/beta/adam_m\r\n",
      "Skipping bert/encoder/layer_4/output/LayerNorm/beta/adam_v\r\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_4', 'output', 'LayerNorm', 'gamma']\r\n",
      "Skipping bert/encoder/layer_4/output/LayerNorm/gamma/adam_m\r\n",
      "Skipping bert/encoder/layer_4/output/LayerNorm/gamma/adam_v\r\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_4', 'output', 'dense', 'bias']\r\n",
      "Skipping bert/encoder/layer_4/output/dense/bias/adam_m\r\n",
      "Skipping bert/encoder/layer_4/output/dense/bias/adam_v\r\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_4', 'output', 'dense', 'kernel']\r\n",
      "Skipping bert/encoder/layer_4/output/dense/kernel/adam_m\r\n",
      "Skipping bert/encoder/layer_4/output/dense/kernel/adam_v\r\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_5', 'attention', 'output', 'LayerNorm', 'beta']\r\n",
      "Skipping bert/encoder/layer_5/attention/output/LayerNorm/beta/adam_m\r\n",
      "Skipping bert/encoder/layer_5/attention/output/LayerNorm/beta/adam_v\r\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_5', 'attention', 'output', 'LayerNorm', 'gamma']\r\n",
      "Skipping bert/encoder/layer_5/attention/output/LayerNorm/gamma/adam_m\r\n",
      "Skipping bert/encoder/layer_5/attention/output/LayerNorm/gamma/adam_v\r\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_5', 'attention', 'output', 'dense', 'bias']\r\n",
      "Skipping bert/encoder/layer_5/attention/output/dense/bias/adam_m\r\n",
      "Skipping bert/encoder/layer_5/attention/output/dense/bias/adam_v\r\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_5', 'attention', 'output', 'dense', 'kernel']\r\n",
      "Skipping bert/encoder/layer_5/attention/output/dense/kernel/adam_m\r\n",
      "Skipping bert/encoder/layer_5/attention/output/dense/kernel/adam_v\r\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_5', 'attention', 'self', 'key', 'bias']\r\n",
      "Skipping bert/encoder/layer_5/attention/self/key/bias/adam_m\r\n",
      "Skipping bert/encoder/layer_5/attention/self/key/bias/adam_v\r\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_5', 'attention', 'self', 'key', 'kernel']\r\n",
      "Skipping bert/encoder/layer_5/attention/self/key/kernel/adam_m\r\n",
      "Skipping bert/encoder/layer_5/attention/self/key/kernel/adam_v\r\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_5', 'attention', 'self', 'query', 'bias']\r\n",
      "Skipping bert/encoder/layer_5/attention/self/query/bias/adam_m\r\n",
      "Skipping bert/encoder/layer_5/attention/self/query/bias/adam_v\r\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_5', 'attention', 'self', 'query', 'kernel']\r\n",
      "Skipping bert/encoder/layer_5/attention/self/query/kernel/adam_m\r\n",
      "Skipping bert/encoder/layer_5/attention/self/query/kernel/adam_v\r\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_5', 'attention', 'self', 'value', 'bias']\r\n",
      "Skipping bert/encoder/layer_5/attention/self/value/bias/adam_m\r\n",
      "Skipping bert/encoder/layer_5/attention/self/value/bias/adam_v\r\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_5', 'attention', 'self', 'value', 'kernel']\r\n",
      "Skipping bert/encoder/layer_5/attention/self/value/kernel/adam_m\r\n",
      "Skipping bert/encoder/layer_5/attention/self/value/kernel/adam_v\r\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_5', 'intermediate', 'dense', 'bias']\r\n",
      "Skipping bert/encoder/layer_5/intermediate/dense/bias/adam_m\r\n",
      "Skipping bert/encoder/layer_5/intermediate/dense/bias/adam_v\r\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_5', 'intermediate', 'dense', 'kernel']\r\n",
      "Skipping bert/encoder/layer_5/intermediate/dense/kernel/adam_m\r\n",
      "Skipping bert/encoder/layer_5/intermediate/dense/kernel/adam_v\r\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_5', 'output', 'LayerNorm', 'beta']\r\n",
      "Skipping bert/encoder/layer_5/output/LayerNorm/beta/adam_m\r\n",
      "Skipping bert/encoder/layer_5/output/LayerNorm/beta/adam_v\r\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_5', 'output', 'LayerNorm', 'gamma']\r\n",
      "Skipping bert/encoder/layer_5/output/LayerNorm/gamma/adam_m\r\n",
      "Skipping bert/encoder/layer_5/output/LayerNorm/gamma/adam_v\r\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_5', 'output', 'dense', 'bias']\r\n",
      "Skipping bert/encoder/layer_5/output/dense/bias/adam_m\r\n",
      "Skipping bert/encoder/layer_5/output/dense/bias/adam_v\r\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_5', 'output', 'dense', 'kernel']\r\n",
      "Skipping bert/encoder/layer_5/output/dense/kernel/adam_m\r\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Skipping bert/encoder/layer_5/output/dense/kernel/adam_v\r\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_6', 'attention', 'output', 'LayerNorm', 'beta']\r\n",
      "Skipping bert/encoder/layer_6/attention/output/LayerNorm/beta/adam_m\r\n",
      "Skipping bert/encoder/layer_6/attention/output/LayerNorm/beta/adam_v\r\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_6', 'attention', 'output', 'LayerNorm', 'gamma']\r\n",
      "Skipping bert/encoder/layer_6/attention/output/LayerNorm/gamma/adam_m\r\n",
      "Skipping bert/encoder/layer_6/attention/output/LayerNorm/gamma/adam_v\r\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_6', 'attention', 'output', 'dense', 'bias']\r\n",
      "Skipping bert/encoder/layer_6/attention/output/dense/bias/adam_m\r\n",
      "Skipping bert/encoder/layer_6/attention/output/dense/bias/adam_v\r\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_6', 'attention', 'output', 'dense', 'kernel']\r\n",
      "Skipping bert/encoder/layer_6/attention/output/dense/kernel/adam_m\r\n",
      "Skipping bert/encoder/layer_6/attention/output/dense/kernel/adam_v\r\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_6', 'attention', 'self', 'key', 'bias']\r\n",
      "Skipping bert/encoder/layer_6/attention/self/key/bias/adam_m\r\n",
      "Skipping bert/encoder/layer_6/attention/self/key/bias/adam_v\r\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_6', 'attention', 'self', 'key', 'kernel']\r\n",
      "Skipping bert/encoder/layer_6/attention/self/key/kernel/adam_m\r\n",
      "Skipping bert/encoder/layer_6/attention/self/key/kernel/adam_v\r\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_6', 'attention', 'self', 'query', 'bias']\r\n",
      "Skipping bert/encoder/layer_6/attention/self/query/bias/adam_m\r\n",
      "Skipping bert/encoder/layer_6/attention/self/query/bias/adam_v\r\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_6', 'attention', 'self', 'query', 'kernel']\r\n",
      "Skipping bert/encoder/layer_6/attention/self/query/kernel/adam_m\r\n",
      "Skipping bert/encoder/layer_6/attention/self/query/kernel/adam_v\r\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_6', 'attention', 'self', 'value', 'bias']\r\n",
      "Skipping bert/encoder/layer_6/attention/self/value/bias/adam_m\r\n",
      "Skipping bert/encoder/layer_6/attention/self/value/bias/adam_v\r\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_6', 'attention', 'self', 'value', 'kernel']\r\n",
      "Skipping bert/encoder/layer_6/attention/self/value/kernel/adam_m\r\n",
      "Skipping bert/encoder/layer_6/attention/self/value/kernel/adam_v\r\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_6', 'intermediate', 'dense', 'bias']\r\n",
      "Skipping bert/encoder/layer_6/intermediate/dense/bias/adam_m\r\n",
      "Skipping bert/encoder/layer_6/intermediate/dense/bias/adam_v\r\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_6', 'intermediate', 'dense', 'kernel']\r\n",
      "Skipping bert/encoder/layer_6/intermediate/dense/kernel/adam_m\r\n",
      "Skipping bert/encoder/layer_6/intermediate/dense/kernel/adam_v\r\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_6', 'output', 'LayerNorm', 'beta']\r\n",
      "Skipping bert/encoder/layer_6/output/LayerNorm/beta/adam_m\r\n",
      "Skipping bert/encoder/layer_6/output/LayerNorm/beta/adam_v\r\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_6', 'output', 'LayerNorm', 'gamma']\r\n",
      "Skipping bert/encoder/layer_6/output/LayerNorm/gamma/adam_m\r\n",
      "Skipping bert/encoder/layer_6/output/LayerNorm/gamma/adam_v\r\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_6', 'output', 'dense', 'bias']\r\n",
      "Skipping bert/encoder/layer_6/output/dense/bias/adam_m\r\n",
      "Skipping bert/encoder/layer_6/output/dense/bias/adam_v\r\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_6', 'output', 'dense', 'kernel']\r\n",
      "Skipping bert/encoder/layer_6/output/dense/kernel/adam_m\r\n",
      "Skipping bert/encoder/layer_6/output/dense/kernel/adam_v\r\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_7', 'attention', 'output', 'LayerNorm', 'beta']\r\n",
      "Skipping bert/encoder/layer_7/attention/output/LayerNorm/beta/adam_m\r\n",
      "Skipping bert/encoder/layer_7/attention/output/LayerNorm/beta/adam_v\r\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_7', 'attention', 'output', 'LayerNorm', 'gamma']\r\n",
      "Skipping bert/encoder/layer_7/attention/output/LayerNorm/gamma/adam_m\r\n",
      "Skipping bert/encoder/layer_7/attention/output/LayerNorm/gamma/adam_v\r\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_7', 'attention', 'output', 'dense', 'bias']\r\n",
      "Skipping bert/encoder/layer_7/attention/output/dense/bias/adam_m\r\n",
      "Skipping bert/encoder/layer_7/attention/output/dense/bias/adam_v\r\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_7', 'attention', 'output', 'dense', 'kernel']\r\n",
      "Skipping bert/encoder/layer_7/attention/output/dense/kernel/adam_m\r\n",
      "Skipping bert/encoder/layer_7/attention/output/dense/kernel/adam_v\r\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_7', 'attention', 'self', 'key', 'bias']\r\n",
      "Skipping bert/encoder/layer_7/attention/self/key/bias/adam_m\r\n",
      "Skipping bert/encoder/layer_7/attention/self/key/bias/adam_v\r\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_7', 'attention', 'self', 'key', 'kernel']\r\n",
      "Skipping bert/encoder/layer_7/attention/self/key/kernel/adam_m\r\n",
      "Skipping bert/encoder/layer_7/attention/self/key/kernel/adam_v\r\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_7', 'attention', 'self', 'query', 'bias']\r\n",
      "Skipping bert/encoder/layer_7/attention/self/query/bias/adam_m\r\n",
      "Skipping bert/encoder/layer_7/attention/self/query/bias/adam_v\r\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_7', 'attention', 'self', 'query', 'kernel']\r\n",
      "Skipping bert/encoder/layer_7/attention/self/query/kernel/adam_m\r\n",
      "Skipping bert/encoder/layer_7/attention/self/query/kernel/adam_v\r\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_7', 'attention', 'self', 'value', 'bias']\r\n",
      "Skipping bert/encoder/layer_7/attention/self/value/bias/adam_m\r\n",
      "Skipping bert/encoder/layer_7/attention/self/value/bias/adam_v\r\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_7', 'attention', 'self', 'value', 'kernel']\r\n",
      "Skipping bert/encoder/layer_7/attention/self/value/kernel/adam_m\r\n",
      "Skipping bert/encoder/layer_7/attention/self/value/kernel/adam_v\r\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_7', 'intermediate', 'dense', 'bias']\r\n",
      "Skipping bert/encoder/layer_7/intermediate/dense/bias/adam_m\r\n",
      "Skipping bert/encoder/layer_7/intermediate/dense/bias/adam_v\r\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_7', 'intermediate', 'dense', 'kernel']\r\n",
      "Skipping bert/encoder/layer_7/intermediate/dense/kernel/adam_m\r\n",
      "Skipping bert/encoder/layer_7/intermediate/dense/kernel/adam_v\r\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_7', 'output', 'LayerNorm', 'beta']\r\n",
      "Skipping bert/encoder/layer_7/output/LayerNorm/beta/adam_m\r\n",
      "Skipping bert/encoder/layer_7/output/LayerNorm/beta/adam_v\r\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_7', 'output', 'LayerNorm', 'gamma']\r\n",
      "Skipping bert/encoder/layer_7/output/LayerNorm/gamma/adam_m\r\n",
      "Skipping bert/encoder/layer_7/output/LayerNorm/gamma/adam_v\r\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_7', 'output', 'dense', 'bias']\r\n",
      "Skipping bert/encoder/layer_7/output/dense/bias/adam_m\r\n",
      "Skipping bert/encoder/layer_7/output/dense/bias/adam_v\r\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_7', 'output', 'dense', 'kernel']\r\n",
      "Skipping bert/encoder/layer_7/output/dense/kernel/adam_m\r\n",
      "Skipping bert/encoder/layer_7/output/dense/kernel/adam_v\r\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_8', 'attention', 'output', 'LayerNorm', 'beta']\r\n",
      "Skipping bert/encoder/layer_8/attention/output/LayerNorm/beta/adam_m\r\n",
      "Skipping bert/encoder/layer_8/attention/output/LayerNorm/beta/adam_v\r\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_8', 'attention', 'output', 'LayerNorm', 'gamma']\r\n",
      "Skipping bert/encoder/layer_8/attention/output/LayerNorm/gamma/adam_m\r\n",
      "Skipping bert/encoder/layer_8/attention/output/LayerNorm/gamma/adam_v\r\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_8', 'attention', 'output', 'dense', 'bias']\r\n",
      "Skipping bert/encoder/layer_8/attention/output/dense/bias/adam_m\r\n",
      "Skipping bert/encoder/layer_8/attention/output/dense/bias/adam_v\r\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_8', 'attention', 'output', 'dense', 'kernel']\r\n",
      "Skipping bert/encoder/layer_8/attention/output/dense/kernel/adam_m\r\n",
      "Skipping bert/encoder/layer_8/attention/output/dense/kernel/adam_v\r\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_8', 'attention', 'self', 'key', 'bias']\r\n",
      "Skipping bert/encoder/layer_8/attention/self/key/bias/adam_m\r\n",
      "Skipping bert/encoder/layer_8/attention/self/key/bias/adam_v\r\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_8', 'attention', 'self', 'key', 'kernel']\r\n",
      "Skipping bert/encoder/layer_8/attention/self/key/kernel/adam_m\r\n",
      "Skipping bert/encoder/layer_8/attention/self/key/kernel/adam_v\r\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_8', 'attention', 'self', 'query', 'bias']\r\n",
      "Skipping bert/encoder/layer_8/attention/self/query/bias/adam_m\r\n",
      "Skipping bert/encoder/layer_8/attention/self/query/bias/adam_v\r\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_8', 'attention', 'self', 'query', 'kernel']\r\n",
      "Skipping bert/encoder/layer_8/attention/self/query/kernel/adam_m\r\n",
      "Skipping bert/encoder/layer_8/attention/self/query/kernel/adam_v\r\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_8', 'attention', 'self', 'value', 'bias']\r\n",
      "Skipping bert/encoder/layer_8/attention/self/value/bias/adam_m\r\n",
      "Skipping bert/encoder/layer_8/attention/self/value/bias/adam_v\r\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_8', 'attention', 'self', 'value', 'kernel']\r\n",
      "Skipping bert/encoder/layer_8/attention/self/value/kernel/adam_m\r\n",
      "Skipping bert/encoder/layer_8/attention/self/value/kernel/adam_v\r\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_8', 'intermediate', 'dense', 'bias']\r\n",
      "Skipping bert/encoder/layer_8/intermediate/dense/bias/adam_m\r\n",
      "Skipping bert/encoder/layer_8/intermediate/dense/bias/adam_v\r\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_8', 'intermediate', 'dense', 'kernel']\r\n",
      "Skipping bert/encoder/layer_8/intermediate/dense/kernel/adam_m\r\n",
      "Skipping bert/encoder/layer_8/intermediate/dense/kernel/adam_v\r\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_8', 'output', 'LayerNorm', 'beta']\r\n",
      "Skipping bert/encoder/layer_8/output/LayerNorm/beta/adam_m\r\n",
      "Skipping bert/encoder/layer_8/output/LayerNorm/beta/adam_v\r\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_8', 'output', 'LayerNorm', 'gamma']\r\n",
      "Skipping bert/encoder/layer_8/output/LayerNorm/gamma/adam_m\r\n",
      "Skipping bert/encoder/layer_8/output/LayerNorm/gamma/adam_v\r\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_8', 'output', 'dense', 'bias']\r\n",
      "Skipping bert/encoder/layer_8/output/dense/bias/adam_m\r\n",
      "Skipping bert/encoder/layer_8/output/dense/bias/adam_v\r\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_8', 'output', 'dense', 'kernel']\r\n",
      "Skipping bert/encoder/layer_8/output/dense/kernel/adam_m\r\n",
      "Skipping bert/encoder/layer_8/output/dense/kernel/adam_v\r\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_9', 'attention', 'output', 'LayerNorm', 'beta']\r\n",
      "Skipping bert/encoder/layer_9/attention/output/LayerNorm/beta/adam_m\r\n",
      "Skipping bert/encoder/layer_9/attention/output/LayerNorm/beta/adam_v\r\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_9', 'attention', 'output', 'LayerNorm', 'gamma']\r\n",
      "Skipping bert/encoder/layer_9/attention/output/LayerNorm/gamma/adam_m\r\n",
      "Skipping bert/encoder/layer_9/attention/output/LayerNorm/gamma/adam_v\r\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_9', 'attention', 'output', 'dense', 'bias']\r\n",
      "Skipping bert/encoder/layer_9/attention/output/dense/bias/adam_m\r\n",
      "Skipping bert/encoder/layer_9/attention/output/dense/bias/adam_v\r\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_9', 'attention', 'output', 'dense', 'kernel']\r\n",
      "Skipping bert/encoder/layer_9/attention/output/dense/kernel/adam_m\r\n",
      "Skipping bert/encoder/layer_9/attention/output/dense/kernel/adam_v\r\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_9', 'attention', 'self', 'key', 'bias']\r\n",
      "Skipping bert/encoder/layer_9/attention/self/key/bias/adam_m\r\n",
      "Skipping bert/encoder/layer_9/attention/self/key/bias/adam_v\r\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_9', 'attention', 'self', 'key', 'kernel']\r\n",
      "Skipping bert/encoder/layer_9/attention/self/key/kernel/adam_m\r\n",
      "Skipping bert/encoder/layer_9/attention/self/key/kernel/adam_v\r\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_9', 'attention', 'self', 'query', 'bias']\r\n",
      "Skipping bert/encoder/layer_9/attention/self/query/bias/adam_m\r\n",
      "Skipping bert/encoder/layer_9/attention/self/query/bias/adam_v\r\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_9', 'attention', 'self', 'query', 'kernel']\r\n",
      "Skipping bert/encoder/layer_9/attention/self/query/kernel/adam_m\r\n",
      "Skipping bert/encoder/layer_9/attention/self/query/kernel/adam_v\r\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_9', 'attention', 'self', 'value', 'bias']\r\n",
      "Skipping bert/encoder/layer_9/attention/self/value/bias/adam_m\r\n",
      "Skipping bert/encoder/layer_9/attention/self/value/bias/adam_v\r\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_9', 'attention', 'self', 'value', 'kernel']\r\n",
      "Skipping bert/encoder/layer_9/attention/self/value/kernel/adam_m\r\n",
      "Skipping bert/encoder/layer_9/attention/self/value/kernel/adam_v\r\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_9', 'intermediate', 'dense', 'bias']\r\n",
      "Skipping bert/encoder/layer_9/intermediate/dense/bias/adam_m\r\n",
      "Skipping bert/encoder/layer_9/intermediate/dense/bias/adam_v\r\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_9', 'intermediate', 'dense', 'kernel']\r\n",
      "Skipping bert/encoder/layer_9/intermediate/dense/kernel/adam_m\r\n",
      "Skipping bert/encoder/layer_9/intermediate/dense/kernel/adam_v\r\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_9', 'output', 'LayerNorm', 'beta']\r\n",
      "Skipping bert/encoder/layer_9/output/LayerNorm/beta/adam_m\r\n",
      "Skipping bert/encoder/layer_9/output/LayerNorm/beta/adam_v\r\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_9', 'output', 'LayerNorm', 'gamma']\r\n",
      "Skipping bert/encoder/layer_9/output/LayerNorm/gamma/adam_m\r\n",
      "Skipping bert/encoder/layer_9/output/LayerNorm/gamma/adam_v\r\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_9', 'output', 'dense', 'bias']\r\n",
      "Skipping bert/encoder/layer_9/output/dense/bias/adam_m\r\n",
      "Skipping bert/encoder/layer_9/output/dense/bias/adam_v\r\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_9', 'output', 'dense', 'kernel']\r\n",
      "Skipping bert/encoder/layer_9/output/dense/kernel/adam_m\r\n",
      "Skipping bert/encoder/layer_9/output/dense/kernel/adam_v\r\n",
      "Initialize PyTorch weight ['bert', 'pooler', 'dense', 'bias']\r\n",
      "Skipping bert/pooler/dense/bias/adam_m\r\n",
      "Skipping bert/pooler/dense/bias/adam_v\r\n",
      "Initialize PyTorch weight ['bert', 'pooler', 'dense', 'kernel']\r\n",
      "Skipping bert/pooler/dense/kernel/adam_m\r\n",
      "Skipping bert/pooler/dense/kernel/adam_v\r\n",
      "Initialize PyTorch weight ['cls', 'predictions', 'output_bias']\r\n",
      "Skipping cls/predictions/output_bias/adam_m\r\n",
      "Skipping cls/predictions/output_bias/adam_v\r\n",
      "Initialize PyTorch weight ['cls', 'predictions', 'transform', 'LayerNorm', 'beta']\r\n",
      "Skipping cls/predictions/transform/LayerNorm/beta/adam_m\r\n",
      "Skipping cls/predictions/transform/LayerNorm/beta/adam_v\r\n",
      "Initialize PyTorch weight ['cls', 'predictions', 'transform', 'LayerNorm', 'gamma']\r\n",
      "Skipping cls/predictions/transform/LayerNorm/gamma/adam_m\r\n",
      "Skipping cls/predictions/transform/LayerNorm/gamma/adam_v\r\n",
      "Initialize PyTorch weight ['cls', 'predictions', 'transform', 'dense', 'bias']\r\n",
      "Skipping cls/predictions/transform/dense/bias/adam_m\r\n",
      "Skipping cls/predictions/transform/dense/bias/adam_v\r\n",
      "Initialize PyTorch weight ['cls', 'predictions', 'transform', 'dense', 'kernel']\r\n",
      "Skipping cls/predictions/transform/dense/kernel/adam_m\r\n",
      "Skipping cls/predictions/transform/dense/kernel/adam_v\r\n",
      "Initialize PyTorch weight ['cls', 'seq_relationship', 'output_bias']\r\n",
      "Skipping cls/seq_relationship/output_bias/adam_m\r\n",
      "Skipping cls/seq_relationship/output_bias/adam_v\r\n",
      "Initialize PyTorch weight ['cls', 'seq_relationship', 'output_weights']\r\n",
      "Skipping cls/seq_relationship/output_weights/adam_m\r\n",
      "Skipping cls/seq_relationship/output_weights/adam_v\r\n",
      "Skipping global_step\r\n",
      "Save PyTorch model to bert-base-bahasa-standard-cased/pytorch_model.bin\r\n"
     ]
    }
   ],
   "source": [
    "!transformers-cli convert --model_type bert \\\n",
    "  --tf_checkpoint bert-base/model.ckpt-500000 \\\n",
    "  --config BASE_config.json \\\n",
    "  --pytorch_dump_output bert-base-bahasa-standard-cased/pytorch_model.bin"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "# BertConfig's first positional argument is vocab_size, not a file path, so the\n",
    "# architecture is read from the same JSON config that the conversion cell used.\n",
    "config = BertConfig.from_json_file('BASE_config.json')\n",
    "# Keep the vocabulary size pinned to the 32000-entry wordpiece vocabulary.\n",
    "config.vocab_size = 32000"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Some weights of the model checkpoint at ./bert-base-bahasa-standard-cased/pytorch_model.bin were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']\n",
      "- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).\n",
      "- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n"
     ]
    }
   ],
   "source": [
    "model = BertForMaskedLM.from_pretrained('./bert-base-bahasa-standard-cased/pytorch_model.bin', \n",
    "                                            config = config)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "fill_mask = pipeline('fill-mask', model=model, tokenizer=tokenizer)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[{'sequence': '[CLS] makan ayam dengan ayam [SEP]',\n",
       "  'score': 0.16134978830814362,\n",
       "  'token': 5838,\n",
       "  'token_str': 'ayam'},\n",
       " {'sequence': '[CLS] makan ayam dengan itik [SEP]',\n",
       "  'score': 0.054859865456819534,\n",
       "  'token': 27174,\n",
       "  'token_str': 'itik'},\n",
       " {'sequence': '[CLS] makan ayam dengan telur [SEP]',\n",
       "  'score': 0.028192562982439995,\n",
       "  'token': 8900,\n",
       "  'token_str': 'telur'},\n",
       " {'sequence': '[CLS] makan ayam dengan sos [SEP]',\n",
       "  'score': 0.025005826726555824,\n",
       "  'token': 18840,\n",
       "  'token_str': 'sos'},\n",
       " {'sequence': '[CLS] makan ayam dengan sambal [SEP]',\n",
       "  'score': 0.02045358717441559,\n",
       "  'token': 24098,\n",
       "  'token_str': 'sambal'}]"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "fill_mask('makan ayam dengan [MASK]')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[{'sequence': '[CLS] mahathir sebenarnya sangat mencintai tanah airnya [SEP]',\n",
       "  'score': 0.09542842209339142,\n",
       "  'token': 14727,\n",
       "  'token_str': 'mencintai'},\n",
       " {'sequence': '[CLS] mahathir sebenarnya sangat cintakan tanah airnya [SEP]',\n",
       "  'score': 0.06102743372321129,\n",
       "  'token': 24174,\n",
       "  'token_str': 'cintakan'},\n",
       " {'sequence': '[CLS] mahathir sebenarnya sangat suka tanah airnya [SEP]',\n",
       "  'score': 0.05743848904967308,\n",
       "  'token': 3085,\n",
       "  'token_str': 'suka'},\n",
       " {'sequence': '[CLS] mahathir sebenarnya sangat sayangkan tanah airnya [SEP]',\n",
       "  'score': 0.04153375327587128,\n",
       "  'token': 22562,\n",
       "  'token_str': 'sayangkan'},\n",
       " {'sequence': '[CLS] mahathir sebenarnya sangat luas tanah airnya [SEP]',\n",
       "  'score': 0.033663298934698105,\n",
       "  'token': 6425,\n",
       "  'token_str': 'luas'}]"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "fill_mask('mahathir sebenarnya sangat [MASK] tanah airnya')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "model.save_pretrained('bert-base-bahasa-standard-cased')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!transformers-cli upload ./bert-base-bahasa-standard-cased"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from transformers import BertTokenizer, BertModel"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Drop the local transformers download cache so the next from_pretrained call\n",
    "# fetches the freshly uploaded files; ~ avoids hardcoding one user's home directory.\n",
    "!rm -rf ~/.cache/torch/transformers"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "model = BertModel.from_pretrained('huseinzol05/bert-base-bahasa-standard-cased')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# The model is cased: use the real do_lower_case argument and keep it False,\n",
    "# matching how the tokenizer was built earlier in this notebook.\n",
    "tokenizer = BertTokenizer.from_pretrained('huseinzol05/bert-base-bahasa-standard-cased', do_lower_case = False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Pasted listing of the uploaded files, kept as comments so this code cell stays runnable.\n",
    "# Columns appear to be: path, last-modified time, ETag, size in bytes (TODO confirm the source command).\n",
    "# bert-base-bahasa-standard-cased/config.json                      2020-11-03T10:33:44.000Z \"6a664b82917ba8837359f1a37dafaef7\"        468 \n",
    "# bert-base-bahasa-standard-cased/pytorch_model.bin                2020-11-03T10:32:47.000Z \"9bb126cc57c6ab01e3f82e4edea5b000\"  445016569 \n",
    "# bert-base-bahasa-standard-cased/special_tokens_map.json          2020-11-03T10:33:42.000Z \"8b3fb1023167bb4ab9d70708eb05f6ec\"        112 \n",
    "# bert-base-bahasa-standard-cased/tokenizer_config.json            2020-11-03T10:33:40.000Z \"8b3fb1023167bb4ab9d70708eb05f6ec\"        112 \n",
    "# bert-base-bahasa-standard-cased/vocab.txt                        2020-11-03T10:33:47.000Z \"8fe213fb6e44f19874592d31fa3655b0\"     224153 "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Done\r\n"
     ]
    }
   ],
   "source": [
    "!transformers-cli s3 rm bert-base-bahasa-standard-cased/vocab.txt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# !transformers-cli upload ./bert-base-bahasa-standard-cased"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
